Merge pull request #3019 from stanmoore1/kk_update_3.5.0
Update Kokkos library in LAMMPS to v3.5.0
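For context, a minimal sketch of a LAMMPS build that exercises the download path changed below. The option names (`PKG_KOKKOS`, `DOWNLOAD_KOKKOS`, `Kokkos_ENABLE_OPENMP`) follow the usual LAMMPS CMake conventions and are assumed here rather than quoted from this PR:

````bash
# Configure LAMMPS with the KOKKOS package and let CMake fetch the
# Kokkos 3.5.00 tarball pinned by KOKKOS_URL/KOKKOS_MD5 in the diff below.
# Hypothetical invocation; adjust backend and arch options to your machine.
mkdir build && cd build
cmake ../cmake \
  -D PKG_KOKKOS=on \
  -D DOWNLOAD_KOKKOS=on \
  -D Kokkos_ENABLE_OPENMP=on
make -j 4
````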
@@ -39,8 +39,8 @@ if(DOWNLOAD_KOKKOS)
 list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}")
 list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}")
 include(ExternalProject)
-set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.4.01.tar.gz" CACHE STRING "URL for KOKKOS tarball")
-set(KOKKOS_MD5 "4c84698917c93a18985b311bb6caf84f" CACHE STRING "MD5 checksum of KOKKOS tarball")
+set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.5.00.tar.gz" CACHE STRING "URL for KOKKOS tarball")
+set(KOKKOS_MD5 "079323d973ae0e1c38c0a54a150c674e" CACHE STRING "MD5 checksum of KOKKOS tarball")
 mark_as_advanced(KOKKOS_URL)
 mark_as_advanced(KOKKOS_MD5)
 ExternalProject_Add(kokkos_build

@@ -60,7 +60,7 @@ if(DOWNLOAD_KOKKOS)
 target_link_libraries(lmp PRIVATE LAMMPS::KOKKOS)
 add_dependencies(LAMMPS::KOKKOS kokkos_build)
 elseif(EXTERNAL_KOKKOS)
-find_package(Kokkos 3.4.01 REQUIRED CONFIG)
+find_package(Kokkos 3.5.00 REQUIRED CONFIG)
 target_link_libraries(lammps PRIVATE Kokkos::kokkos)
 target_link_libraries(lmp PRIVATE Kokkos::kokkos)
 else()
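A hedged sketch of the other branch above: building LAMMPS against an already-installed Kokkos instead of downloading it. `EXTERNAL_KOKKOS` comes from the diff itself; `Kokkos_ROOT` is the standard CMake mechanism that `find_package(Kokkos ...)` relies on, assumed here rather than shown in the PR:

````bash
# Point find_package(Kokkos 3.5.00 REQUIRED CONFIG) at an existing install.
# Hypothetical paths; the install must be Kokkos 3.5.00 or newer.
cmake ../cmake \
  -D PKG_KOKKOS=on \
  -D EXTERNAL_KOKKOS=on \
  -D Kokkos_ROOT=$HOME/kokkos-3.5.00/install
````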
@@ -560,11 +560,26 @@ They must be specified in uppercase.
 * - VEGA908
   - GPU
   - AMD GPU MI100 GFX908
-* - INTEL_GEN
+* - VEGA90A
   - GPU
-  - Intel GPUs Gen9+
+  - AMD GPU
+* - INTEL_DG1
+  - GPU
+  - Intel Iris XeMAX GPU
+* - INTEL_GEN9
+  - GPU
+  - Intel GPU Gen9
+* - INTEL_GEN11
+  - GPU
+  - Intel GPU Gen11
+* - INTEL_GEN12LP
+  - GPU
+  - Intel GPU Gen12LP
+* - INTEL_XEHP
+  - GPU
+  - Intel GPUs Xe-HP

-This list was last updated for version 3.4.1 of the Kokkos library.
+This list was last updated for version 3.5.0 of the Kokkos library.

 .. tabs::
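A short sketch of selecting one of the newly documented targets. `Kokkos_ARCH_VEGA90A` matches the Kokkos 3.5 CMake naming, though the exact spelling for a given build should be checked against the table above:

````bash
# Enable the HIP backend for an AMD GFX90A-class GPU.
# Hypothetical example; swap VEGA90A for INTEL_DG1 etc. as appropriate.
cmake ../cmake \
  -D PKG_KOKKOS=on \
  -D Kokkos_ENABLE_HIP=on \
  -D Kokkos_ARCH_VEGA90A=on
````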
@@ -1,5 +1,165 @@

# Change Log

## [3.5.00](https://github.com/kokkos/kokkos/tree/3.5.00) (2021-10-19)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.01...3.5.00)

### Features:

- Add support for quad-precision math functions/traits [\#4098](https://github.com/kokkos/kokkos/pull/4098)
- Adding ExecutionSpace partitioning function [\#4096](https://github.com/kokkos/kokkos/pull/4096)
- Improve Python Interop Capabilities [\#4065](https://github.com/kokkos/kokkos/pull/4065)
- Add half_t Kokkos::rand specialization [\#3922](https://github.com/kokkos/kokkos/pull/3922)
- Add math special functions: erf, erfcx, expint1, Bessel functions, Hankel functions [\#3920](https://github.com/kokkos/kokkos/pull/3920)
- Add missing common mathematical functions [\#4043](https://github.com/kokkos/kokkos/pull/4043) [\#4036](https://github.com/kokkos/kokkos/pull/4036) [\#4034](https://github.com/kokkos/kokkos/pull/4034)
- Let the numeric traits be SFINAE-friendly [\#4038](https://github.com/kokkos/kokkos/pull/4038)
- Add Desul atomics - enabling memory-order and memory-scope parameters [\#3247](https://github.com/kokkos/kokkos/pull/3247)
- Add detection idiom from the C++ standard library extension version 2 [\#3980](https://github.com/kokkos/kokkos/pull/3980)
- Fence Profiling Support in all backends [\#3966](https://github.com/kokkos/kokkos/pull/3966) [\#4304](https://github.com/kokkos/kokkos/pull/4304) [\#4258](https://github.com/kokkos/kokkos/pull/4258) [\#4232](https://github.com/kokkos/kokkos/pull/4232)
- Significant SYCL enhancements (see below)

### Deprecations:

- Deprecate CUDA_SAFE_CALL and HIP_SAFE_CALL [\#4249](https://github.com/kokkos/kokkos/pull/4249)
- Deprecate Kokkos::Impl::Timer (Kokkos::Timer has been available for a long time) [\#4201](https://github.com/kokkos/kokkos/pull/4201)
- Deprecate Experimental::MasterLock [\#4094](https://github.com/kokkos/kokkos/pull/4094)
- Deprecate Kokkos_TaskPolicy.hpp (headers got reorganized, doesn't remove functionality) [\#4011](https://github.com/kokkos/kokkos/pull/4011)
- Deprecate backward compatibility features [\#3978](https://github.com/kokkos/kokkos/pull/3978)
- Update and deprecate is_space::host_memory/execution/mirror_space [\#3973](https://github.com/kokkos/kokkos/pull/3973)

### Backends and Archs Enhancements:

- Enabling constbitset constructors in kernels [\#4296](https://github.com/kokkos/kokkos/pull/4296)
- Use ZeroMemset in View constructor to improve performance [\#4226](https://github.com/kokkos/kokkos/pull/4226)
- Use memset in deep_copy [\#3944](https://github.com/kokkos/kokkos/pull/3944)
- Add missing fence() calls in resize(View) that effectively do deep_copy(resized, orig) [\#4212](https://github.com/kokkos/kokkos/pull/4212)
- Avoid allocations in resize and realloc [\#4207](https://github.com/kokkos/kokkos/pull/4207)
- StaticCsrGraph: use device type instead of execution space to construct views [\#3991](https://github.com/kokkos/kokkos/pull/3991)
- Consider std::sort when view is accessible from host [\#3929](https://github.com/kokkos/kokkos/pull/3929)
- Fix CPP20 warnings except for volatile [\#4312](https://github.com/kokkos/kokkos/pull/4312)

#### SYCL:
- Introduce SYCLHostUSMSpace [\#4268](https://github.com/kokkos/kokkos/pull/4268)
- Implement SYCL TeamPolicy for vector_size > 1 [\#4183](https://github.com/kokkos/kokkos/pull/4183)
- Enable 64bit ranges for SYCL [\#4211](https://github.com/kokkos/kokkos/pull/4211)
- Don't print SYCL device info in execution space initialization [\#4168](https://github.com/kokkos/kokkos/pull/4168)
- Improve SYCL MDRangePolicy performance [\#4161](https://github.com/kokkos/kokkos/pull/4161)
- Use sub_groups in SYCL parallel_scan [\#4147](https://github.com/kokkos/kokkos/pull/4147)
- Implement subgroup reduction for SYCL RangePolicy parallel_reduce [\#3940](https://github.com/kokkos/kokkos/pull/3940)
- Use DPC++ broadcast extension in SYCL team_broadcast [\#4103](https://github.com/kokkos/kokkos/pull/4103)
- Only fence in SYCL parallel_reduce for non-device-accessible result_ptr [\#4089](https://github.com/kokkos/kokkos/pull/4089)
- Improve fencing behavior in SYCL backend [\#4088](https://github.com/kokkos/kokkos/pull/4088)
- Fence all registered SYCL queues before deallocating memory [\#4086](https://github.com/kokkos/kokkos/pull/4086)
- Implement SYCL::print_configuration [\#3992](https://github.com/kokkos/kokkos/pull/3992)
- Reuse scratch memory in parallel_scan and TeamPolicy (decreases memory footprint) [\#3899](https://github.com/kokkos/kokkos/pull/3899) [\#3889](https://github.com/kokkos/kokkos/pull/3889)

#### CUDA:
- Cuda improve heuristic for blocksize [\#4271](https://github.com/kokkos/kokkos/pull/4271)
- Don't use [[deprecated]] for nvcc [\#4229](https://github.com/kokkos/kokkos/pull/4229)
- Improve error message for NVHPC as host compiler [\#4227](https://github.com/kokkos/kokkos/pull/4227)
- Update support for cuda reductions to work with types < 4 bytes [\#4156](https://github.com/kokkos/kokkos/pull/4156)
- Fix incompatible team size deduction in rare cases parallel_reduce [\#4142](https://github.com/kokkos/kokkos/pull/4142)
- Remove UVM usage in DynamicView [\#4129](https://github.com/kokkos/kokkos/pull/4129)
- Remove dependency between core and containers [\#4114](https://github.com/kokkos/kokkos/pull/4114)
- Adding opt-in CudaMallocSync support when using CUDA version >= 11.2 [\#4026](https://github.com/kokkos/kokkos/pull/4026) [\#4233](https://github.com/kokkos/kokkos/pull/4233)
- Fix a potential race condition in the CUDA backend [\#3999](https://github.com/kokkos/kokkos/pull/3999)

#### HIP:
- Implement new blocksize deduction method for HIP Backend [\#3953](https://github.com/kokkos/kokkos/pull/3953)
- Add multiple LaunchMechanism [\#3820](https://github.com/kokkos/kokkos/pull/3820)
- Make HIP backend thread-safe [\#4170](https://github.com/kokkos/kokkos/pull/4170)

#### Serial:
- Refactor Serial backend and fix thread-safety issue [\#4053](https://github.com/kokkos/kokkos/pull/4053)

#### OpenMPTarget:
- OpenMPTarget: support array reductions in RangePolicy [\#4040](https://github.com/kokkos/kokkos/pull/4040)
- OpenMPTarget: add MDRange parallel_reduce [\#4032](https://github.com/kokkos/kokkos/pull/4032)
- OpenMPTarget: Fix bug for the case of a reducer. [\#4044](https://github.com/kokkos/kokkos/pull/4044)
- OpenMPTarget: verify process fix [\#4041](https://github.com/kokkos/kokkos/pull/4041)

### Implemented enhancements: BuildSystem

#### Important BuildSystem Updates:
- Use hipcc architecture autodetection when Kokkos_ARCH is not set [\#3941](https://github.com/kokkos/kokkos/pull/3941)
- Introduce Kokkos_ENABLE_DEPRECATION_WARNINGS and remove deprecated code with Kokkos_ENABLE_DEPRECATED_CODE_3 [\#4106](https://github.com/kokkos/kokkos/pull/4106) [\#3855](https://github.com/kokkos/kokkos/pull/3855)

#### Other Improvements:
- Add allow-unsupported-compiler flag to nvcc-wrapper [\#4298](https://github.com/kokkos/kokkos/pull/4298)
- nvcc_wrapper: fix errors in argument handling [\#3993](https://github.com/kokkos/kokkos/pull/3993)
- Adds support for -time=<file> and -time <file> in nvcc_wrapper [\#4015](https://github.com/kokkos/kokkos/pull/4015)
- nvcc_wrapper: suppress duplicates of GPU architecture and RDC flags [\#3968](https://github.com/kokkos/kokkos/pull/3968)
- Fix TMPDIR support in nvcc_wrapper [\#3792](https://github.com/kokkos/kokkos/pull/3792)
- NVHPC: update PGI compiler arch flags [\#4133](https://github.com/kokkos/kokkos/pull/4133)
- Replace PGI with NVHPC (works for both) [\#4196](https://github.com/kokkos/kokkos/pull/4196)
- Make sure that KOKKOS_CXX_HOST_COMPILER_ID is defined [\#4235](https://github.com/kokkos/kokkos/pull/4235)
- Add options to Makefile builds for deprecated code and warnings [\#4215](https://github.com/kokkos/kokkos/pull/4215)
- Use KOKKOS_CXX_HOST_COMPILER_ID for identifying CPU arch flags [\#4199](https://github.com/kokkos/kokkos/pull/4199)
- Added support for Cray Clang to Makefile.kokkos [\#4176](https://github.com/kokkos/kokkos/pull/4176)
- Add XLClang as compiler [\#4120](https://github.com/kokkos/kokkos/pull/4120)
- Keep quoted compiler flags when passing to Trilinos [\#3987](https://github.com/kokkos/kokkos/pull/3987)
- Add support for AMD Zen3 CPU architecture [\#3972](https://github.com/kokkos/kokkos/pull/3972)
- Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945)
- Add cppcoreguidelines-pro-type-cstyle-cast to clang-tidy [\#3522](https://github.com/kokkos/kokkos/pull/3522)
- Add sve bit size definition for A64FX [\#3947](https://github.com/kokkos/kokkos/pull/3947) [\#3946](https://github.com/kokkos/kokkos/pull/3946)
- Remove KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES [\#4150](https://github.com/kokkos/kokkos/pull/4150)
### Other Changes:

#### Tool Enhancements:

- Retrieve original value from a point in a MultidimensionalSparseTuningProblem [\#3977](https://github.com/kokkos/kokkos/pull/3977)
- Allow extension of built-in tuners with additional tuning axes [\#3961](https://github.com/kokkos/kokkos/pull/3961)
- Added a categorical tuner [\#3955](https://github.com/kokkos/kokkos/pull/3955)

#### Miscellaneous:

- hpcbind: Use double quotes around $@ when invoking user command [\#4284](https://github.com/kokkos/kokkos/pull/4284)
- Add file and line to error message [\#3985](https://github.com/kokkos/kokkos/pull/3985)
- Fix compiler warnings when compiling with nvc++ [\#4198](https://github.com/kokkos/kokkos/pull/4198)
- Add OpenMPTarget CI build on AMD GPUs [\#4055](https://github.com/kokkos/kokkos/pull/4055)
- CI: icpx is now part of intel container [\#4002](https://github.com/kokkos/kokkos/pull/4002)

### Incompatibilities:

- Remove pre CUDA 9 KOKKOS_IMPL_CUDA_* macros [\#4138](https://github.com/kokkos/kokkos/pull/4138)

### Bug Fixes:
- UnorderedMap::clear() should zero the size() [\#4130](https://github.com/kokkos/kokkos/pull/4130)
- Add memory fence for HostSharedPtr::cleanup() [\#4144](https://github.com/kokkos/kokkos/pull/4144)
- SYCL: Fix race conditions in TeamPolicy::parallel_reduce [\#4418](https://github.com/kokkos/kokkos/pull/4418)
- Adding missing memory fence to serial exec space fence. [\#4292](https://github.com/kokkos/kokkos/pull/4292)
- Fix using external SYCL queues in tests [\#4291](https://github.com/kokkos/kokkos/pull/4291)
- Fix digits10 bug [\#4281](https://github.com/kokkos/kokkos/pull/4281)
- Fixes constexpr errors with frounding-math on gcc < 10. [\#4278](https://github.com/kokkos/kokkos/pull/4278)
- Fix compiler flags for PGI/NVHPC [\#4264](https://github.com/kokkos/kokkos/pull/4264)
- Fix Zen2/3 also implying Zen Arch with Makefiles [\#4260](https://github.com/kokkos/kokkos/pull/4260)
- Kokkos_Cuda.hpp: Fix shadow warning with cuda/11.0 [\#4252](https://github.com/kokkos/kokkos/pull/4252)
- Fix issue w/ static initialization of function attributes [\#4242](https://github.com/kokkos/kokkos/pull/4242)
- Disable long double hypot test on Power systems [\#4221](https://github.com/kokkos/kokkos/pull/4221)
- Fix false sharing in random pool [\#4218](https://github.com/kokkos/kokkos/pull/4218)
- Fix a missing memory_fence for debug shared alloc code [\#4216](https://github.com/kokkos/kokkos/pull/4216)
- Fix two xl issues [\#4179](https://github.com/kokkos/kokkos/pull/4179)
- Makefile.kokkos: fix (standard_in) 1: syntax error [\#4173](https://github.com/kokkos/kokkos/pull/4173)
- Fixes for query_device example [\#4172](https://github.com/kokkos/kokkos/pull/4172)
- Fix a bug when using HIP atomic with Kokkos::Complex [\#4159](https://github.com/kokkos/kokkos/pull/4159)
- Fix mistaken logic in pthread creation [\#4157](https://github.com/kokkos/kokkos/pull/4157)
- Define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION when requesting Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON [\#4107](https://github.com/kokkos/kokkos/pull/4107)
- Fix compilation with latest MSVC version [\#4102](https://github.com/kokkos/kokkos/pull/4102)
- Fix incorrect macro definitions when compiling with Intel compiler on Windows [\#4087](https://github.com/kokkos/kokkos/pull/4087)
- Fixup global buffer overflow in hand rolled string manipulation [\#4070](https://github.com/kokkos/kokkos/pull/4070)
- Fixup heap buffer overflow in cmd line args parsing unit tests [\#4069](https://github.com/kokkos/kokkos/pull/4069)
- Only add quotes in compiler flags for Trilinos if necessary [\#4067](https://github.com/kokkos/kokkos/pull/4067)
- Fixed invocation of tools init callbacks [\#4061](https://github.com/kokkos/kokkos/pull/4061)
- Work around SYCL JIT compiler issues with static variables [\#4013](https://github.com/kokkos/kokkos/pull/4013)
- Fix TestDetectionIdiom.cpp test inclusion for Trilinos/TriBITS [\#4010](https://github.com/kokkos/kokkos/pull/4010)
- Fixup allocation headers with OpenMPTarget backend [\#4003](https://github.com/kokkos/kokkos/pull/4003)
- Add missing specialization for OMPT to Kokkos Random [\#3967](https://github.com/kokkos/kokkos/pull/3967)
- Disable hypot long double test on power arches [\#3962](https://github.com/kokkos/kokkos/pull/3962)
- Use different EBO workaround for MSVC (rebased) [\#3924](https://github.com/kokkos/kokkos/pull/3924)
- Fix SYCL Kokkos::Profiling::(de)allocateData calls [\#3928](https://github.com/kokkos/kokkos/pull/3928)

## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01)
@@ -111,8 +111,8 @@ ENDIF()

 set(Kokkos_VERSION_MAJOR 3)
-set(Kokkos_VERSION_MINOR 4)
-set(Kokkos_VERSION_PATCH 01)
+set(Kokkos_VERSION_MINOR 5)
+set(Kokkos_VERSION_PATCH 00)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
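The `math(EXPR ...)` line above encodes the version as a single integer, so 3.5.00 becomes 30500. A quick sanity check of that encoding, mirroring the formula:

````bash
# KOKKOS_VERSION = major*10000 + minor*100 + patch
echo $((3 * 10000 + 5 * 100 + 0))   # prints 30500 for Kokkos 3.5.00
````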
@@ -210,7 +210,12 @@ IF (KOKKOS_HAS_TRILINOS)
 # which needs another workaround.
 SET(KOKKOS_COMPILE_OPTIONS_TMP)
 FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
-LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\")
+STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
+IF(OPTION_HAS_WHITESPACE EQUAL -1)
+LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}")
+ELSE()
+LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"")
+ENDIF()
 ENDFOREACH()
 STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}")
 LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS})
@@ -11,20 +11,21 @@ CXXFLAGS += $(SHFLAGS)
 endif

 KOKKOS_VERSION_MAJOR = 3
-KOKKOS_VERSION_MINOR = 4
-KOKKOS_VERSION_PATCH = 01
+KOKKOS_VERSION_MINOR = 5
+KOKKOS_VERSION_PATCH = 00
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

-# Options: Cuda,HIP,OpenMP,Pthread,Serial
+# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Pthread,Serial
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthread"
 # Options:
 # Intel: KNC,KNL,SNB,HSW,BDW,SKX
 # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
 # IBM: BGQ,Power7,Power8,Power9
-# AMD-GPUS: Vega900,Vega906,Vega908
+# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
+# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP
 KOKKOS_ARCH ?= ""
 # Options: yes,no
 KOKKOS_DEBUG ?= "no"
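A hedged example of driving these Makefile options with the newly listed devices and architectures. The variable names come from the snippet above; the `build` target is a placeholder for whatever the application Makefile provides:

````bash
# Inline Kokkos build via Makefile.kokkos with the new options.
make KOKKOS_DEVICES=HIP KOKKOS_ARCH=Vega90A build
# or, for an Intel GPU through SYCL:
make KOKKOS_DEVICES=SYCL KOKKOS_ARCH=IntelDG1 build
````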
@@ -32,7 +33,7 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++14,c++1y,c++17,c++1z,c++2a
 KOKKOS_CXX_STANDARD ?= "c++14"
-# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align
+# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings
 KOKKOS_OPTIONS ?= ""
 KOKKOS_CMAKE ?= "no"
 KOKKOS_TRIBITS ?= "no"

@@ -80,7 +81,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper

 # Check for advanced settings.
 KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
-KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
+KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
 KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
 KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
@@ -92,6 +93,9 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
 KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
 KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
+KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics)
+KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
+KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings)

 KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)
@@ -112,6 +116,7 @@ endif
 # Check for other Execution Spaces.
 KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda)
 KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP)
+KOKKOS_INTERNAL_USE_SYCL := $(call kokkos_has_string,$(KOKKOS_DEVICES),SYCL)
 KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget)

 KOKKOS_DEVICELIST =

@@ -133,11 +138,18 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
 KOKKOS_DEVICELIST += HIP
 endif
+KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
+                                       + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
+                                       + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+KOKKOS_DEVICELIST += SYCL
+ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
+$(error SYCL backend requires C++17 or newer)
+endif
+
+endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 KOKKOS_DEVICELIST += OPENMPTARGET
-KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
-                                       + $(KOKKOS_INTERNAL_ENABLE_CXX20) \
-                                       + $(KOKKOS_INTERNAL_ENABLE_CXX2A))
 ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
 $(error OpenMPTarget backend requires C++17 or newer)
 endif
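Per the checks above, SYCL and OpenMPTarget builds abort unless a C++17-or-newer standard is selected. A minimal sketch of a conforming invocation; the variable names come from this Makefile, the `build` target is a placeholder:

````bash
# KOKKOS_CXX_STANDARD defaults to c++14, which the SYCL backend rejects.
make KOKKOS_DEVICES=SYCL KOKKOS_CXX_STANDARD=c++17 build
````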
@@ -168,6 +180,8 @@ KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2
 KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-"))
 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc))
 KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
+KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++"))
+KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI)
 KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
 KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
 KOKKOS_INTERNAL_COMPILER_GCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)

@@ -247,7 +261,11 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 KOKKOS_INTERNAL_OPENMP_FLAG := -mp
 else
 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
+ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
+KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+else
 KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
+endif
 else
 ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
 KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp

@@ -259,7 +277,11 @@ else
 # OpenMP is turned on by default in Cray compiler environment.
 KOKKOS_INTERNAL_OPENMP_FLAG :=
 else
-KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
+KOKKOS_INTERNAL_OPENMP_FLAG := -fiopenmp
+else
+KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+endif
 endif
 endif
 endif
 endif
@@ -317,6 +339,13 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
 KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)

+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1)
+KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP)
+
 # NVIDIA based.
 NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper
 KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30)

@@ -384,20 +413,25 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
 KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
+KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3)
 KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
-KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 0)
+KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
+endif
+endif
 KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
 KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)
 KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908)
+KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A)

 # Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
 KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
-KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
 KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX))

 # Decide what ISA level we are able to support.
-KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
+KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
 KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9))
 KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7))
@@ -406,7 +440,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW
 KOKKOS_INTERNAL_USE_TM := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX))

 # Incompatible flags?
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc )
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc)
 KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc)

 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
@@ -442,6 +476,10 @@ KOKKOS_LINK_FLAGS =
 KOKKOS_SRC =
 KOKKOS_HEADERS =

+#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
+KOKKOS_LIBS += -latomic
+#endif
+
 # Generating the KokkosCore_config.h file.

 KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
@@ -478,6 +516,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
 tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP')
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_SYCL')
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
 ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
@@ -533,6 +575,12 @@ endif

 #only add the c++ standard flags if this is not CMake
 tmp := $(call kokkos_append_header,"/* General Settings */")
+ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3")
+endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATION_WARNINGS")
+endif
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
 ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
 KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)

@@ -635,8 +683,10 @@ endif

 tmp := $(call kokkos_append_header,"/* Optimization Settings */")

-ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
+ifeq ($(KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION), 1)
+# deprecated
 tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION")
 endif

 tmp := $(call kokkos_append_header,"/* Cuda Settings */")
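A sketch of exercising the two new deprecation toggles wired up above; the option strings come from the KOKKOS_OPTIONS list earlier in this Makefile, the `build` target is a placeholder:

````bash
# Keep deprecated Kokkos-3 APIs but warn on use:
make KOKKOS_OPTIONS=enable_deprecation_warnings build
# Or compile them out entirely:
make KOKKOS_OPTIONS=disable_deprecated_code build
````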
@@ -1166,6 +1216,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
 tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
 KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 90A")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
+KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a
+endif

 KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
@@ -1184,6 +1239,52 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
 endif
 endif

+# Figure out the architecture flag for SYCL.
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+# Lets start with adding architecture defines
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
+KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
+KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
+KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
+KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
+KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
+KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
+endif
+
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp)
+
+KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda
+KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+KOKKOS_LDFLAGS+=-fsycl
+KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
+endif

 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
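With the block above, selecting an Intel GPU architecture translates into `-fsycl-targets=...` device flags. A hedged example; the `KOKKOS_ARCH` spellings come from this Makefile, everything else is an assumption about the surrounding application build:

````bash
# Build for an Intel Iris XeMAX (DG1) GPU with ahead-of-time SYCL compilation.
make KOKKOS_DEVICES=SYCL KOKKOS_ARCH=IntelDG1 KOKKOS_CXX_STANDARD=c++17 build
# KOKKOS_CXXFLAGS then gains: -fsycl -fno-sycl-id-queries-fit-in-int \
#   -fsycl-unnamed-lambda and the -fsycl-targets flag shown above.
````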
@@ -1196,56 +1297,62 @@ endif
 ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
 tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)

-# Functions for generating config header file
-kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
-kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
-kokkos_append_config_header = $(shell echo $1 >> $2))
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
-tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
-tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
-tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
-ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
-ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-else
-endif
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
-ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
-tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
-tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
-endif
+# Functions for generating config header file
+kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
+kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
+kokkos_append_config_header = $(shell echo $1 >> $2))
+tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
+tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
+tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
+tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
+tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
+tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
+tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_CUDA.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_CUDA.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_Cuda.hpp>","KokkosCore_Config_SetupBackend.hpp")
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
+else
+endif
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMPTARGET.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMPTARGET.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SYCL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SYCL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_SYCL.hpp>","KokkosCore_Config_SetupBackend.hpp")
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HIP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HIP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <setup/Kokkos_Setup_HIP.hpp>","KokkosCore_Config_SetupBackend.hpp")
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_OPENMP.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_OPENMP.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_THREADS.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_THREADS.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HPX.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HPX.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_SERIAL.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_SERIAL.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
+tmp := $(call kokkos_append_config_header,"$H""include <fwd/Kokkos_Fwd_HBWSpace.hpp>","KokkosCore_Config_FwdBackend.hpp")
+tmp := $(call kokkos_append_config_header,"$H""include <decl/Kokkos_Declare_HBWSpace.hpp>","KokkosCore_Config_DeclareBackend.hpp")
+endif
 endif
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)

@@ -1257,6 +1364,9 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
+KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+endif
 KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
 ifneq ($(CUDA_PATH),)
 KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include
@@ -48,6 +48,17 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
+Kokkos_SYCL.o : $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
+Kokkos_SYCL_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
+Kokkos_SYCL_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
+endif

 ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
@@ -7,7 +7,7 @@ applications targeting all major HPC platforms. For that purpose it provides
 abstractions for both parallel execution of code and data management.
 Kokkos is designed to target complex node architectures with N-level memory
 hierarchies and multiple types of execution resources. It currently can use
-CUDA, HPX, OpenMP and Pthreads as backend programming models with several other
+CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other
 backends in development.

 Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem,

@@ -16,29 +16,19 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools).

 # Learning about Kokkos

-A programming guide can be found on the Wiki, the API reference is under development.
+The best way to start learning about Kokkos is going through the Kokkos Lectures.
+They are online available at https://kokkos.link/the-lectures and contain a mix
+of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem
+capabilities.
+
+A programming guide and API reference can be found on the Wiki
+(https://github.com/kokkos/kokkos/wiki).

 For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.

 For non-public questions send an email to
 crtrott(at)sandia.gov

 A separate repository with extensive tutorial material can be found under
 https://github.com/kokkos/kokkos-tutorials.

 Furthermore, the 'example/tutorial' directory provides step by step tutorial
 examples which explain many of the features of Kokkos. They work with
 simple Makefiles. To build with g++ and OpenMP simply type 'make'
 in the 'example/tutorial' directory. This will build all examples in the
 subfolders. To change the build options refer to the Programming Guide
 in the compilation section.

-To learn more about Kokkos consider watching one of our presentations:
-* GTC 2015:
-  - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html
-  - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf

 # Contributing to Kokkos

 We are open and try to encourage contributions from external developers.
@@ -53,57 +43,40 @@ For specifics see the LICENSE file contained in the repository or distribution.

 # Requirements

-### Primary tested compilers on X86 are:
-* GCC 5.3.0
-* GCC 5.4.0
-* GCC 5.5.0
-* GCC 6.1.0
-* GCC 7.2.0
-* GCC 7.3.0
-* GCC 8.1.0
-* Intel 17.0.1
-* Intel 17.4.196
-* Intel 18.2.128
-* Clang 4.0.0
-* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0)
-* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1)
-* Clang 8.0.0 for CUDA (CUDA Toolkit 9.2)
-* PGI 18.7
-* NVCC 9.1 for CUDA (with gcc 6.1.0)
-* NVCC 9.2 for CUDA (with gcc 7.2.0)
-* NVCC 10.0 for CUDA (with gcc 7.4.0)
-* NVCC 10.1 for CUDA (with gcc 7.4.0)
-* NVCC 11.0 for CUDA (with gcc 8.4.0)
+### Minimum Compiler Versions

-### Primary tested compilers on Power 8 are:
-* GCC 6.4.0 (OpenMP,Serial)
-* GCC 7.2.0 (OpenMP,Serial)
-* IBM XL 16.1.0 (OpenMP, Serial)
-* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0)
+Generally Kokkos should work with all compiler versions newer than the minimum.
+However, as in all sufficiently complex code, we have to work around compiler
+bugs with almost all compilers. So compiler versions we don't test may have issues
+we are unaware of.

-### Primary tested compilers on Intel KNL are:
-* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0)
-* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0)
+* GCC: 5.3.0
+* Clang: 4.0.0
+* Intel: 17.0.1
+* NVCC: 9.2.88
+* NVC++: 21.5
+* ROCM: 4.3
+* MSVC: 19.29
+* IBM XL: 16.1.1
+* Fujitsu: 4.5.0
+* ARM/Clang 20.1

-### Primary tested compilers on ARM (Cavium ThunderX2)
-* GCC 7.2.0
-* ARM/Clang 18.4.0
+### Primary Tested Compilers

-### Other compilers working:
-* X86:
-  * Cygwin 2.1.0 64bit with gcc 4.9.3
-  * GCC 8.1.0 (not warning free)

-### Known non-working combinations:
-* Power8:
-  * Pthreads backend
-* ARM
-  * Pthreads backend
+* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0
+* NVCC: 9.2.88, 10.1, 11.0
+* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0
+* Intel 17.4, 18.1, 19.5
+* MSVC: 19.29
+* ARM/Clang: 20.1
+* IBM XL: 16.1.1
+* ROCM: 4.3.0

 ### Build system:
-* CMake >= 3.10: required
-* CMake >= 3.13: recommended
+* CMake >= 3.16: required
+* CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues).
+* CMake >= 3.21.1 for NVC++

 Primary tested compilers are passing in release mode
 with warnings as errors. They also are tested with a comprehensive set of
@@ -153,7 +126,6 @@ cmake $srcdir \
   -DCMAKE_INSTALL_PREFIX=$path_to_install \
   -DKokkos_ENABLE_OPENMP=On \
   -DKokkos_ARCH_HSW=On \
-  -DKokkos_ENABLE_HWLOC=On \
-  -DKokkos_HWLOC_DIR=$path_to_hwloc
+  -DKokkos_ENABLE_HWLOC=On
 ````
 then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages.
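As a follow-on sketch (not from the README itself): a downstream project would then consume the install with standard CMake package lookup, typically via `find_package(Kokkos REQUIRED)` and `target_link_libraries(app Kokkos::kokkos)` in its CMakeLists:

````bash
# Hypothetical downstream app picking up the installed Kokkos package.
cmake $app_srcdir \
  -DKokkos_ROOT=$path_to_install
````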
@@ -212,23 +184,8 @@ where `...` is the unique spec identifying the particular Kokkos configuration a
 Some more details can be found in the Kokkos spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest).

 ## Raw Makefile
-A bash script is provided to generate raw makefiles.
-To install Kokkos as a library create a build directory and run the following
-````bash
-> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install
-````
-Once the Makefile is generated, run:
-````bash
-> make kokkoslib
-> make install
-````
-To additionally run the unit tests:
-````bash
-> make build-test
-> make test
-````
-Run `generate_makefile.bash --help` for more detailed options such as
-changing the device type for which to build.
+
+Raw Makefiles are only supported via inline builds. See below.

 ## Inline Builds vs. Installed Package
 For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package.
@@ -268,6 +225,35 @@ more than a single GPU is used by a single process.

 If you publish work which mentions Kokkos, please cite the following paper:

+````BibTex
+@ARTICLE{9485033,
+  author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah},
+  journal={IEEE Transactions on Parallel and Distributed Systems},
+  title={Kokkos 3: Programming Model Extensions for the Exascale Era},
+  year={2022},
+  volume={33},
+  number={4},
+  pages={805-817},
+  doi={10.1109/TPDS.2021.3097283}}
+````
+
+If you use more than one Kokkos EcoSystem package, please also cite:
+
+````BibTex
+@ARTICLE{9502936,
+  author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff},
+  journal={Computing in Science Engineering},
+  title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing},
+  year={2021},
+  volume={23},
+  number={5},
+  pages={10-18},
+  doi={10.1109/MCSE.2021.3098509}}
+````
+
+And if you feel generous: feel free to cite the original Kokkos paper which describes most of the basic Kokkos concepts:

 ````BibTeX
 @article{CarterEdwards20143202,
 title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
@@ -5,9 +5,7 @@ KOKKOS_SUBPACKAGE(Algorithms)
 IF (NOT Kokkos_INSTALL_TESTING)
   ADD_SUBDIRECTORY(src)
 ENDIF()
-IF(NOT (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR
-         KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)))
+IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
   KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
 ENDIF()
@@ -447,6 +447,25 @@ struct rand<Generator, unsigned long long> {
   }
 };

+#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+template <class Generator>
+struct rand<Generator, Kokkos::Experimental::half_t> {
+  using half = Kokkos::Experimental::half_t;
+  KOKKOS_INLINE_FUNCTION
+  static half max() { return half(1.0); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen) { return half(gen.frand()); }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& range) {
+    return half(gen.frand(float(range)));
+  }
+  KOKKOS_INLINE_FUNCTION
+  static half draw(Generator& gen, const half& start, const half& end) {
+    return half(gen.frand(float(start), float(end)));
+  }
+};
+#endif  // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT

 template <class Generator>
 struct rand<Generator, float> {
   KOKKOS_INLINE_FUNCTION
@ -600,7 +619,7 @@ struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
|
||||
|
||||
template <class ExecutionSpace>
|
||||
struct Random_UniqueIndex {
|
||||
using locks_view_type = View<int*, ExecutionSpace>;
|
||||
using locks_view_type = View<int**, ExecutionSpace>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type) {
|
||||
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
@ -615,7 +634,7 @@ struct Random_UniqueIndex {
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
template <>
|
||||
struct Random_UniqueIndex<Kokkos::Cuda> {
|
||||
using locks_view_type = View<int*, Kokkos::Cuda>;
|
||||
using locks_view_type = View<int**, Kokkos::Cuda>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type& locks_) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
@ -625,7 +644,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
|
||||
blockDim.x * blockDim.y * blockDim.z +
|
||||
i_offset) %
|
||||
locks_.extent(0);
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
|
||||
i += blockDim.x * blockDim.y * blockDim.z;
|
||||
if (i >= static_cast<int>(locks_.extent(0))) {
|
||||
i = i_offset;
|
||||
@ -643,7 +662,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
template <>
|
||||
struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
|
||||
using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
|
||||
using locks_view_type = View<int**, Kokkos::Experimental::HIP>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type& locks_) {
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
@ -653,7 +672,7 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
|
||||
blockDim.x * blockDim.y * blockDim.z +
|
||||
i_offset) %
|
||||
locks_.extent(0);
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
|
||||
i += blockDim.x * blockDim.y * blockDim.z;
|
||||
if (i >= static_cast<int>(locks_.extent(0))) {
|
||||
i = i_offset;
|
||||
@ -671,15 +690,15 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
|
||||
#ifdef KOKKOS_ENABLE_SYCL
|
||||
template <>
|
||||
struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
|
||||
using locks_view_type = View<int*, Kokkos::Experimental::SYCL>;
|
||||
using locks_view_type = View<int**, Kokkos::Experimental::SYCL>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type& locks_) {
|
||||
#ifdef KOKKOS_ARCH_INTEL_GEN
|
||||
#ifdef KOKKOS_ARCH_INTEL_GPU
|
||||
int i = Kokkos::Impl::clock_tic() % locks_.extent(0);
|
||||
#else
|
||||
int i = 0;
|
||||
#endif
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
|
||||
i = (i + 1) % static_cast<int>(locks_.extent(0));
|
||||
}
|
||||
return i;
|
||||
@ -690,14 +709,14 @@ struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
|
||||
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
||||
template <>
|
||||
struct Random_UniqueIndex<Kokkos::Experimental::OpenMPTarget> {
|
||||
using locks_view_type = View<int*, Kokkos::Experimental::OpenMPTarget>;
|
||||
using locks_view_type = View<int**, Kokkos::Experimental::OpenMPTarget>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type& locks) {
|
||||
const int team_size = omp_get_num_threads();
|
||||
int i = omp_get_team_num() * team_size + omp_get_thread_num();
|
||||
const int lock_size = locks.extent_int(0);
|
||||
|
||||
while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) {
|
||||
while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
|
||||
i = (i + 1) % lock_size;
|
||||
}
|
||||
return i;
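Every specialization above implements the same protocol: probe the lock array until a compare-and-swap from 0 to 1 succeeds, then hand back the claimed index. A distilled, standalone sketch (`acquire_state` is a hypothetical name, not Kokkos API; only the two-dimensional lock view from this change is assumed):

```c++
// Illustrative restatement of the loop shared by the backends above.
// locks(i, 0) == 0 means generator state i is free; CAS 0 -> 1 claims it.
template <class LocksView>
KOKKOS_FUNCTION int acquire_state(const LocksView& locks, int start) {
  int i = start;
  // atomic_compare_exchange returns the previous value, so a nonzero
  // result means another thread holds state i and we probe the next one.
  while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
    i = (i + 1) % static_cast<int>(locks.extent(0));
  }
  return i;
}
```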
@ -856,18 +875,22 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift64_Pool {
 private:
  using execution_space = typename DeviceType::execution_space;
  using locks_type = View<int*, execution_space>;
  using state_data_type = View<uint64_t*, DeviceType>;
  using locks_type = View<int**, execution_space>;
  using state_data_type = View<uint64_t**, DeviceType>;
  locks_type locks_;
  state_data_type state_;
  int num_states_;
  int padding_;

 public:
  using generator_type = Random_XorShift64<DeviceType>;
  using device_type = DeviceType;

  KOKKOS_INLINE_FUNCTION
  Random_XorShift64_Pool() { num_states_ = 0; }
  Random_XorShift64_Pool() {
    num_states_ = 0;
    padding_ = 0;
  }
  Random_XorShift64_Pool(uint64_t seed) {
    num_states_ = 0;
@ -883,16 +906,22 @@ class Random_XorShift64_Pool {
    locks_ = src.locks_;
    state_ = src.state_;
    num_states_ = src.num_states_;
    padding_ = src.padding_;
    return *this;
  }

  void init(uint64_t seed, int num_states) {
    if (seed == 0) seed = uint64_t(1318319);

    // I only want to pad on CPU like archs (less than 1000 threads). 64 is a
    // magic number, or random number I just wanted something not too large and
    // not too small. 64 sounded fine.
    padding_ = num_states < 1000 ? 64 : 1;
    num_states_ = num_states;

    locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);
    locks_ =
        locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_);
    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_,
                             padding_);

    typename state_data_type::HostMirror h_state = create_mirror_view(state_);
    typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
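A note on the layout change in `init` above: the second extent only spaces the states apart. With `padding_ = 64` and 4-byte lock words, and assuming a row-major host layout, `locks_(i, 0)` and `locks_(i + 1, 0)` sit 64 x 4 = 256 bytes apart, on different cache lines, so host threads spinning on neighboring locks stop false-sharing; with 1000 or more states (the GPU case) `padding_ = 1` keeps the old dense layout.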
@ -902,15 +931,15 @@ class Random_XorShift64_Pool {
        gen(seed, 0);
    for (int i = 0; i < 17; i++) gen.rand();
    for (int i = 0; i < num_states_; i++) {
      int n1 = gen.rand();
      int n2 = gen.rand();
      int n3 = gen.rand();
      int n4 = gen.rand();
      h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
                   (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
                   (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
                   (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
      h_lock(i) = 0;
      int n1 = gen.rand();
      int n2 = gen.rand();
      int n3 = gen.rand();
      int n4 = gen.rand();
      h_state(i, 0) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
                      (((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
                      (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
                      (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
      h_lock(i, 0) = 0;
    }
    deep_copy(state_, h_state);
    deep_copy(locks_, h_lock);
@ -920,19 +949,19 @@ class Random_XorShift64_Pool {
  Random_XorShift64<DeviceType> get_state() const {
    const int i =
        Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
    return Random_XorShift64<DeviceType>(state_(i), i);
    return Random_XorShift64<DeviceType>(state_(i, 0), i);
  }

  // NOTE: state_idx MUST be unique and less than num_states
  KOKKOS_INLINE_FUNCTION
  Random_XorShift64<DeviceType> get_state(const int state_idx) const {
    return Random_XorShift64<DeviceType>(state_(state_idx), state_idx);
    return Random_XorShift64<DeviceType>(state_(state_idx, 0), state_idx);
  }

  KOKKOS_INLINE_FUNCTION
  void free_state(const Random_XorShift64<DeviceType>& state) const {
    state_(state.state_idx_) = state.state_;
    locks_(state.state_idx_) = 0;
    state_(state.state_idx_, 0) = state.state_;
    locks_(state.state_idx_, 0) = 0;
  }
};
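Taken together, `get_state`/`free_state` give the usual acquire-draw-release pattern; a self-contained usage sketch (names, seed, and extent are illustrative):

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int n = 1 << 20;
    Kokkos::View<double*> results("results", n);
    Kokkos::Random_XorShift64_Pool<> pool(12345);
    Kokkos::parallel_for(
        "draw", n, KOKKOS_LAMBDA(const int i) {
          // Locks one of the pool's states via Random_UniqueIndex.
          auto gen = pool.get_state();
          results(i) = gen.drand();  // uniform double in [0, 1)
          pool.free_state(gen);      // release so other threads can reuse it
        });
  }
  Kokkos::finalize();
}
```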
@ -1092,14 +1121,15 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift1024_Pool {
 private:
  using execution_space = typename DeviceType::execution_space;
  using locks_type = View<int*, execution_space>;
  using int_view_type = View<int*, DeviceType>;
  using locks_type = View<int**, execution_space>;
  using int_view_type = View<int**, DeviceType>;
  using state_data_type = View<uint64_t * [16], DeviceType>;

  locks_type locks_;
  state_data_type state_;
  int_view_type p_;
  int num_states_;
  int padding_;
  friend class Random_XorShift1024<DeviceType>;

 public:
@ -1129,15 +1159,21 @@ class Random_XorShift1024_Pool {
    state_ = src.state_;
    p_ = src.p_;
    num_states_ = src.num_states_;
    padding_ = src.padding_;
    return *this;
  }

  inline void init(uint64_t seed, int num_states) {
    if (seed == 0) seed = uint64_t(1318319);
    // I only want to pad on CPU like archs (less than 1000 threads). 64 is a
    // magic number, or random number I just wanted something not too large and
    // not too small. 64 sounded fine.
    padding_ = num_states < 1000 ? 64 : 1;
    num_states_ = num_states;
    locks_ = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
    locks_ =
        locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_);
    state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
    p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);
    p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_);

    typename state_data_type::HostMirror h_state = create_mirror_view(state_);
    typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
@ -1158,8 +1194,8 @@ class Random_XorShift1024_Pool {
                   (((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
                   (((static_cast<uint64_t>(n4)) & 0xffff) << 48);
      }
      h_p(i) = 0;
      h_lock(i) = 0;
      h_p(i, 0) = 0;
      h_lock(i, 0) = 0;
    }
    deep_copy(state_, h_state);
    deep_copy(locks_, h_lock);
@ -1169,20 +1205,20 @@ class Random_XorShift1024_Pool {
  Random_XorShift1024<DeviceType> get_state() const {
    const int i =
        Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
    return Random_XorShift1024<DeviceType>(state_, p_(i), i);
    return Random_XorShift1024<DeviceType>(state_, p_(i, 0), i);
  };

  // NOTE: state_idx MUST be unique and less than num_states
  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
    return Random_XorShift1024<DeviceType>(state_, p_(state_idx), state_idx);
    return Random_XorShift1024<DeviceType>(state_, p_(state_idx, 0), state_idx);
  }

  KOKKOS_INLINE_FUNCTION
  void free_state(const Random_XorShift1024<DeviceType>& state) const {
    for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
    p_(state.state_idx_) = state.p_;
    locks_(state.state_idx_) = 0;
    p_(state.state_idx_, 0) = state.p_;
    locks_(state.state_idx_, 0) = 0;
  }
};

@ -206,8 +206,10 @@ class BinSort {
  //----------------------------------------
  // Constructor: takes the keys, the binning_operator and optionally whether to
  // sort within bins (default false)
  BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
          BinSortOp bin_op_, bool sort_within_bins_ = false)
  template <typename ExecutionSpace>
  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
          int range_begin_, int range_end_, BinSortOp bin_op_,
          bool sort_within_bins_ = false)
      : keys(keys_),
        keys_rnd(keys_),
        bin_op(bin_op_),
@ -222,50 +224,63 @@ class BinSort {
        "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins());
    bin_count_const = bin_count_atomic;
    bin_offsets =
        offset_type(view_alloc(WithoutInitializing,
        offset_type(view_alloc(exec, WithoutInitializing,
                               "Kokkos::SortImpl::BinSortFunctor::bin_offsets"),
                    bin_op.max_bins());
    sort_order =
        offset_type(view_alloc(WithoutInitializing,
        offset_type(view_alloc(exec, WithoutInitializing,
                               "Kokkos::SortImpl::BinSortFunctor::sort_order"),
                    range_end - range_begin);
  }

  BinSort(const_key_view_type keys_, int range_begin_, int range_end_,
          BinSortOp bin_op_, bool sort_within_bins_ = false)
      : BinSort(execution_space{}, keys_, range_begin_, range_end_, bin_op_,
                sort_within_bins_) {}

  template <typename ExecutionSpace>
  BinSort(const ExecutionSpace& exec, const_key_view_type keys_,
          BinSortOp bin_op_, bool sort_within_bins_ = false)
      : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {}

  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
          bool sort_within_bins_ = false)
      : BinSort(keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {}
      : BinSort(execution_space{}, keys_, bin_op_, sort_within_bins_) {}

  //----------------------------------------
  // Create the permutation vector, the bin_offset array and the bin_count
  // array. Can be called again if keys changed
  void create_permute_vector() {
  template <class ExecutionSpace = execution_space>
  void create_permute_vector(const ExecutionSpace& exec = execution_space{}) {
    const size_t len = range_end - range_begin;
    Kokkos::parallel_for(
        "Kokkos::Sort::BinCount",
        Kokkos::RangePolicy<execution_space, bin_count_tag>(0, len), *this);
        Kokkos::RangePolicy<ExecutionSpace, bin_count_tag>(exec, 0, len),
        *this);
    Kokkos::parallel_scan("Kokkos::Sort::BinOffset",
                          Kokkos::RangePolicy<execution_space, bin_offset_tag>(
                              0, bin_op.max_bins()),
                          Kokkos::RangePolicy<ExecutionSpace, bin_offset_tag>(
                              exec, 0, bin_op.max_bins()),
                          *this);

    Kokkos::deep_copy(bin_count_atomic, 0);
    Kokkos::deep_copy(exec, bin_count_atomic, 0);
    Kokkos::parallel_for(
        "Kokkos::Sort::BinBinning",
        Kokkos::RangePolicy<execution_space, bin_binning_tag>(0, len), *this);
        Kokkos::RangePolicy<ExecutionSpace, bin_binning_tag>(exec, 0, len),
        *this);

    if (sort_within_bins)
      Kokkos::parallel_for(
          "Kokkos::Sort::BinSort",
          Kokkos::RangePolicy<execution_space, bin_sort_bins_tag>(
              0, bin_op.max_bins()),
          Kokkos::RangePolicy<ExecutionSpace, bin_sort_bins_tag>(
              exec, 0, bin_op.max_bins()),
          *this);
  }

  // Sort a subset of a view with respect to the first dimension using the
  // permutation array
  template <class ValuesViewType>
  void sort(ValuesViewType const& values, int values_range_begin,
            int values_range_end) const {
  template <class ExecutionSpace, class ValuesViewType>
  void sort(const ExecutionSpace& exec, ValuesViewType const& values,
            int values_range_begin, int values_range_end) const {
    using scratch_view_type =
        Kokkos::View<typename ValuesViewType::data_type,
                     typename ValuesViewType::array_layout,
@ -279,7 +294,7 @@ class BinSort {
    }

    scratch_view_type sorted_values(
        view_alloc(WithoutInitializing,
        view_alloc(exec, WithoutInitializing,
                   "Kokkos::SortImpl::BinSortFunctor::sorted_values"),
        values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
        values.rank_dynamic > 1 ? values.extent(1)
@ -308,7 +323,7 @@ class BinSort {
          values_range_begin - range_begin);

      parallel_for("Kokkos::Sort::CopyPermute",
                   Kokkos::RangePolicy<execution_space>(0, len), functor);
                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
    }

    {
@ -316,10 +331,23 @@ class BinSort {
          values, range_begin, sorted_values);

      parallel_for("Kokkos::Sort::Copy",
                   Kokkos::RangePolicy<execution_space>(0, len), functor);
                   Kokkos::RangePolicy<ExecutionSpace>(exec, 0, len), functor);
    }
  }

    execution_space().fence();
  // Sort a subset of a view with respect to the first dimension using the
  // permutation array
  template <class ValuesViewType>
  void sort(ValuesViewType const& values, int values_range_begin,
            int values_range_end) const {
    execution_space exec;
    sort(exec, values, values_range_begin, values_range_end);
    exec.fence("Kokkos::Sort: fence after sorting");
  }

  template <class ExecutionSpace, class ValuesViewType>
  void sort(ExecutionSpace const& exec, ValuesViewType const& values) const {
    this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin);
  }

  template <class ValuesViewType>
@ -485,17 +513,19 @@ struct BinOp3D {

namespace Impl {

template <class ViewType>
bool try_std_sort(ViewType view) {
template <class ViewType, class ExecutionSpace>
bool try_std_sort(ViewType view, const ExecutionSpace& exec) {
  bool possible = true;
  size_t stride[8] = {view.stride_0(), view.stride_1(), view.stride_2(),
                      view.stride_3(), view.stride_4(), view.stride_5(),
                      view.stride_6(), view.stride_7()};
  possible = possible &&
             std::is_same<typename ViewType::memory_space, HostSpace>::value;
             SpaceAccessibility<HostSpace,
                                typename ViewType::memory_space>::accessible;
  possible = possible && (ViewType::Rank == 1);
  possible = possible && (stride[0] == 1);
  if (possible) {
    exec.fence("Kokkos::sort: Fence before sorting on the host");
    std::sort(view.data(), view.data() + view.extent(0));
  }
  return possible;
@ -518,10 +548,12 @@ struct min_max_functor {

}  // namespace Impl

template <class ViewType>
void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
template <class ExecutionSpace, class ViewType>
std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
    const ExecutionSpace& exec, ViewType const& view,
    bool const always_use_kokkos_sort = false) {
  if (!always_use_kokkos_sort) {
    if (Impl::try_std_sort(view)) return;
    if (Impl::try_std_sort(view, exec)) return;
  }
  using CompType = BinOp1D<ViewType>;

@ -529,34 +561,50 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
  parallel_reduce("Kokkos::Sort::FindExtent",
                  Kokkos::RangePolicy<typename ViewType::execution_space>(
                      0, view.extent(0)),
                      exec, 0, view.extent(0)),
                  Impl::min_max_functor<ViewType>(view), reducer);
  if (result.min_val == result.max_val) return;
  BinSort<ViewType, CompType> bin_sort(
      view, CompType(view.extent(0) / 2, result.min_val, result.max_val), true);
  bin_sort.create_permute_vector();
  bin_sort.sort(view);
  bin_sort.create_permute_vector(exec);
  bin_sort.sort(exec, view);
}

template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) {
void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
  typename ViewType::execution_space exec;
  sort(exec, view, always_use_kokkos_sort);
  exec.fence("Kokkos::Sort: fence after sorting");
}

template <class ExecutionSpace, class ViewType>
std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
    const ExecutionSpace& exec, ViewType view, size_t const begin,
    size_t const end) {
  using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>;
  using CompType = BinOp1D<ViewType>;

  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);

  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(begin, end),
  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end),
                  Impl::min_max_functor<ViewType>(view), reducer);

  if (result.min_val == result.max_val) return;

  BinSort<ViewType, CompType> bin_sort(
      view, begin, end,
      exec, view, begin, end,
      CompType((end - begin) / 2, result.min_val, result.max_val), true);

  bin_sort.create_permute_vector();
  bin_sort.sort(view, begin, end);
  bin_sort.create_permute_vector(exec);
  bin_sort.sort(exec, view, begin, end);
}

template <class ViewType>
void sort(ViewType view, size_t const begin, size_t const end) {
  typename ViewType::execution_space exec;
  sort(exec, view, begin, end);
  exec.fence("Kokkos::Sort: fence after sorting");
}

}  // namespace Kokkos
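The net effect of this file's changes is a stream-aware sort API; a minimal usage sketch contrasting the new execution-space-instance overload (asynchronous, caller fences) with the old one (fences internally). View names and sizes are illustrative:

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::DefaultExecutionSpace exec;
    Kokkos::View<int*> keys("keys", 1 << 20);
    Kokkos::Random_XorShift64_Pool<> pool(1931);
    Kokkos::fill_random(keys, pool, 100000);

    // New overload: runs on the given instance, no implicit fence.
    Kokkos::sort(exec, keys);
    exec.fence("wait for sort");

    // Old overload still works and fences internally before returning.
    Kokkos::sort(keys);
  }
  Kokkos::finalize();
}
```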

@ -47,7 +47,7 @@
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <cmath>
@ -198,11 +198,50 @@ struct test_random_functor {
          static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp2 / theMax);
      const uint64_t ind3_3d =
          static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp3 / theMax);

      // Workaround Intel 17 compiler bug which sometimes add random
      // instruction alignment which makes the lock instruction
      // illegal. Seems to be mostly just for unsigned int atomics.
      // Looking at the assembly the compiler
      // appears to insert cache line alignment for the instruction.
      // Isn't restricted to specific archs. Seen it on SNB and SKX, but for
      // different code. Another occurrence was with Desul atomics in
      // a different unit test. This one here happens without desul atomics.
      // Inserting an assembly nop instruction changes the alignment and
      // works round this.
      //
      // 17.0.4 for 64bit Random works with 1/1/1/2/1
      // 17.0.4 for 1024bit Random works with 1/1/1/1/1
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      atomic_fetch_add(&density_1d(ind1_1d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      atomic_fetch_add(&density_1d(ind2_1d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      atomic_fetch_add(&density_1d(ind3_1d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      if (std::is_same<rnd_type, Kokkos::Random_XorShift64<device_type>>::value)
        asm volatile("nop\n");
      asm volatile("nop\n");
#endif
#endif
      atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
    }
    rand_pool.free_state(rand_gen);
  }
@ -338,9 +377,11 @@ struct test_random_scalar {
    using functor_type =
        test_histogram1d_functor<typename RandomGenerator::device_type>;
    parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result);

    double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
    double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D;
    double mean_eps_expect = 0.0001;
    double variance_eps_expect = 0.07;
    double covariance_eps_expect = 0.06;
    double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
    double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D;
    double variance_expect =
        1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
    double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D;
@ -349,11 +390,26 @@ struct test_random_scalar {
        variance_expect / (result.variance / HIST_DIM1D) - 1.0;
    double covariance_eps =
        (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
    pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0;
    pass_hist1d_var =
        ((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 1 : 0;
    pass_hist1d_covar =
        ((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 1 : 0;

#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
    if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
      mean_eps_expect = 0.0003;
      variance_eps_expect = 1.0;
      covariance_eps_expect = 5.0e4;
    }
#endif

    pass_hist1d_mean =
        ((-mean_eps_expect < mean_eps) && (mean_eps_expect > mean_eps)) ? 1
                                                                        : 0;
    pass_hist1d_var = ((-variance_eps_expect < variance_eps) &&
                       (variance_eps_expect > variance_eps))
                          ? 1
                          : 0;
    pass_hist1d_covar = ((-covariance_eps_expect < covariance_eps) &&
                         (covariance_eps_expect > covariance_eps))
                            ? 1
                            : 0;

    cout << "Density 1D: " << mean_eps << " " << variance_eps << " "
         << (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || "
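For orientation (an observation, not part of the diff): each of the `num_draws` draws increments three of the `HIST_DIM1D` bins, so a single bin count follows Binomial(3N, 1/B) with N = num_draws and B = HIST_DIM1D, giving mean 3N/B, variance 3N/B(1 - 1/B), and pairwise covariance -3N/B^2, which is exactly `mean_expect`, `variance_expect`, and `covariance_expect` above; the widened `*_eps_expect` bounds for `half_t` account for its much coarser resolution.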
@ -371,8 +427,9 @@ struct test_random_scalar {
        test_histogram3d_functor<typename RandomGenerator::device_type>;
    parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result);

    double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
    double mean_expect = 1.0 * num_draws / HIST_DIM1D;
    double variance_factor = 1.2;
    double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
    double mean_expect = 1.0 * num_draws / HIST_DIM1D;
    double variance_expect =
        1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
    double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D;
@ -381,15 +438,23 @@ struct test_random_scalar {
        variance_expect / (result.variance / HIST_DIM1D) - 1.0;
    double covariance_eps =
        (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;

#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
    if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
      variance_factor = 7;
    }
#endif

    pass_hist3d_mean =
        ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0;
    pass_hist3d_var = ((-1.2 * tolerance < variance_eps) &&
                       (1.2 * tolerance > variance_eps))
    pass_hist3d_var = ((-variance_factor * tolerance < variance_eps) &&
                       (variance_factor * tolerance > variance_eps))
                          ? 1
                          : 0;
    pass_hist3d_covar =
        ((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 1
                                                                        : 0;
    pass_hist3d_covar = ((-variance_factor * tolerance < covariance_eps) &&
                         (variance_factor * tolerance > covariance_eps))
                            ? 1
                            : 0;

    cout << "Density 3D: " << mean_eps << " " << variance_eps << " "
         << result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance
@ -471,6 +536,21 @@ void test_random(unsigned int num_draws) {
  deep_copy(density_1d, 0);
  deep_copy(density_3d, 0);

  cout << "Test Scalar=half" << endl;
  test_random_scalar<RandomGenerator, Kokkos::Experimental::half_t> test_half(
      density_1d, density_3d, pool, num_draws);
  ASSERT_EQ(test_half.pass_mean, 1);
  ASSERT_EQ(test_half.pass_var, 1);
  ASSERT_EQ(test_half.pass_covar, 1);
  ASSERT_EQ(test_half.pass_hist1d_mean, 1);
  ASSERT_EQ(test_half.pass_hist1d_var, 1);
  ASSERT_EQ(test_half.pass_hist1d_covar, 1);
  ASSERT_EQ(test_half.pass_hist3d_mean, 1);
  ASSERT_EQ(test_half.pass_hist3d_var, 1);
  ASSERT_EQ(test_half.pass_hist3d_covar, 1);
  deep_copy(density_1d, 0);
  deep_copy(density_3d, 0);

  cout << "Test Scalar=float" << endl;
  test_random_scalar<RandomGenerator, float> test_float(density_1d, density_3d,
                                                        pool, num_draws);
@ -135,8 +135,9 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
  KeyViewType keys("Keys", n);

  // Test sorting array with all numbers equal
  Kokkos::deep_copy(keys, KeyType(1));
  Kokkos::sort(keys, force_kokkos);
  ExecutionSpace exec;
  Kokkos::deep_copy(exec, keys, KeyType(1));
  Kokkos::sort(exec, keys, force_kokkos);

  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
  Kokkos::fill_random(keys, g,
@ -147,13 +148,16 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) {
  double sum_after = 0.0;
  unsigned int sort_fails = 0;

  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_before);
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
                          sum<ExecutionSpace, KeyType>(keys), sum_before);

  Kokkos::sort(keys, force_kokkos);
  Kokkos::sort(exec, keys, force_kokkos);

  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys), sum_after);
  Kokkos::parallel_reduce(
      n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys), sort_fails);
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
                          sum<ExecutionSpace, KeyType>(keys), sum_after);
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n - 1),
                          is_sorted_struct<ExecutionSpace, KeyType>(keys),
                          sort_fails);

  double ratio = sum_before / sum_after;
  double epsilon = 1e-10;
@ -177,8 +181,10 @@ void test_3D_sort_impl(unsigned int n) {
  double sum_after = 0.0;
  unsigned int sort_fails = 0;

  Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys),
                          sum_before);
  ExecutionSpace exec;
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
      sum3D<ExecutionSpace, KeyType>(keys), sum_before);

  int bin_1d = 1;
  while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2;
@ -189,15 +195,17 @@ void test_3D_sort_impl(unsigned int n) {
  using BinOp = Kokkos::BinOp3D<KeyViewType>;
  BinOp bin_op(bin_max, min, max);
  Kokkos::BinSort<KeyViewType, BinOp> Sorter(keys, bin_op, false);
  Sorter.create_permute_vector();
  Sorter.template sort<KeyViewType>(keys);
  Sorter.create_permute_vector(exec);
  Sorter.sort(exec, keys);

  Kokkos::parallel_reduce(keys.extent(0), sum3D<ExecutionSpace, KeyType>(keys),
                          sum_after);
  Kokkos::parallel_reduce(keys.extent(0) - 1,
                          bin3d_is_sorted_struct<ExecutionSpace, KeyType>(
                              keys, bin_1d, min[0], max[0]),
                          sort_fails);
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0)),
      sum3D<ExecutionSpace, KeyType>(keys), sum_after);
  Kokkos::parallel_reduce(
      Kokkos::RangePolicy<ExecutionSpace>(exec, 0, keys.extent(0) - 1),
      bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys, bin_1d, min[0],
                                                      max[0]),
      sort_fails);

  double ratio = sum_before / sum_after;
  double epsilon = 1e-10;
@ -229,36 +237,36 @@ void test_dynamic_view_sort_impl(unsigned int n) {
  KeyViewType keys_view("KeysTmp", n);

  // Test sorting array with all numbers equal
  Kokkos::deep_copy(keys_view, KeyType(1));
  ExecutionSpace exec;
  Kokkos::deep_copy(exec, keys_view, KeyType(1));
  Kokkos::deep_copy(keys, keys_view);
  Kokkos::sort(keys, 0 /* begin */, n /* end */);
  Kokkos::sort(exec, keys, 0 /* begin */, n /* end */);

  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
  Kokkos::fill_random(keys_view, g,
                      Kokkos::Random_XorShift64_Pool<
                          ExecutionSpace>::generator_type::MAX_URAND);

  ExecutionSpace().fence();
  exec.fence();
  Kokkos::deep_copy(keys, keys_view);
  // ExecutionSpace().fence();

  double sum_before = 0.0;
  double sum_after = 0.0;
  unsigned int sort_fails = 0;

  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view),
                          sum_before);
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
                          sum<ExecutionSpace, KeyType>(keys_view), sum_before);

  Kokkos::sort(keys, 0 /* begin */, n /* end */);
  Kokkos::sort(exec, keys, 0 /* begin */, n /* end */);

  ExecutionSpace().fence();  // Need this fence to prevent BusError with Cuda
  exec.fence();  // Need this fence to prevent BusError with Cuda
  Kokkos::deep_copy(keys_view, keys);
  // ExecutionSpace().fence();

  Kokkos::parallel_reduce(n, sum<ExecutionSpace, KeyType>(keys_view),
                          sum_after);
  Kokkos::parallel_reduce(
      n - 1, is_sorted_struct<ExecutionSpace, KeyType>(keys_view), sort_fails);
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n),
                          sum<ExecutionSpace, KeyType>(keys_view), sum_after);
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(exec, 0, n - 1),
                          is_sorted_struct<ExecutionSpace, KeyType>(keys_view),
                          sort_fails);

  double ratio = sum_before / sum_after;
  double epsilon = 1e-10;
@ -301,9 +309,10 @@ void test_issue_1160_impl() {
  for (int i = 0; i < 10; ++i) {
    h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i));
  }
  Kokkos::deep_copy(element_, h_element);
  Kokkos::deep_copy(x_, h_x);
  Kokkos::deep_copy(v_, h_v);
  ExecutionSpace exec;
  Kokkos::deep_copy(exec, element_, h_element);
  Kokkos::deep_copy(exec, x_, h_x);
  Kokkos::deep_copy(exec, v_, h_v);

  using KeyViewType = decltype(element_);
  using BinOp = Kokkos::BinOp1D<KeyViewType>;
@ -316,15 +325,16 @@ void test_issue_1160_impl() {

  Kokkos::BinSort<KeyViewType, BinOp> Sorter(element_, begin, end, binner,
                                             false);
  Sorter.create_permute_vector();
  Sorter.sort(element_, begin, end);
  Sorter.create_permute_vector(exec);
  Sorter.sort(exec, element_, begin, end);

  Sorter.sort(x_, begin, end);
  Sorter.sort(v_, begin, end);
  Sorter.sort(exec, x_, begin, end);
  Sorter.sort(exec, v_, begin, end);

  Kokkos::deep_copy(h_element, element_);
  Kokkos::deep_copy(h_x, x_);
  Kokkos::deep_copy(h_v, v_);
  Kokkos::deep_copy(exec, h_element, element_);
  Kokkos::deep_copy(exec, h_x, x_);
  Kokkos::deep_copy(exec, h_v, v_);
  exec.fence();

  ASSERT_EQ(h_element(0), 9);
  ASSERT_EQ(h_element(1), 8);
@ -3,4 +3,8 @@ image:
clone_folder: c:\projects\source
build_script:
- cmd: >-
    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake
    mkdir build &&
    cd build &&
    cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF &&
    cmake --build . --target install &&
    ctest -C Debug --output-on-failure
@ -1,12 +1,12 @@
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <Kokkos_Random.hpp>

template <class Scalar>
double test_atomic(int L, int N, int M, int K, int R,
                   Kokkos::View<const int*> offsets) {
  Kokkos::View<Scalar*> output("Output", N);
  Kokkos::Impl::Timer timer;
  Kokkos::Timer timer;

  for (int r = 0; r < R; r++)
    Kokkos::parallel_for(
@ -28,7 +28,7 @@ template <class Scalar>
double test_no_atomic(int L, int N, int M, int K, int R,
                      Kokkos::View<const int*> offsets) {
  Kokkos::View<Scalar*> output("Output", N);
  Kokkos::Impl::Timer timer;
  Kokkos::Timer timer;
  for (int r = 0; r < R; r++)
    Kokkos::parallel_for(
        L, KOKKOS_LAMBDA(const int& i) {
@ -43,7 +43,7 @@
 */

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

template <class Scalar, int Unroll, int Stride>
struct Run {

@ -43,7 +43,7 @@
 */

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <bench.hpp>
#include <cstdlib>

@ -43,7 +43,7 @@
 */

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <gather.hpp>
#include <cstdlib>
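These include swaps are mechanical: `Kokkos::Impl::Timer` and the public `Kokkos::Timer` expose the same interface. For reference, a minimal sketch of the API being adopted:

```c++
#include <Kokkos_Timer.hpp>

double time_work() {
  Kokkos::Timer timer;  // starts timing at construction
  // ... work to be timed ...
  double elapsed = timer.seconds();  // seconds since construction/reset
  timer.reset();                     // restart the clock for the next section
  return elapsed;
}
```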

@ -52,35 +52,33 @@

#define HLINE "-------------------------------------------------------------\n"

#if defined(KOKKOS_ENABLE_CUDA)
using StreamHostArray = Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror;
using StreamDeviceArray = Kokkos::View<double*, Kokkos::CudaSpace>;
#else
using StreamHostArray = Kokkos::View<double*, Kokkos::HostSpace>::HostMirror;
using StreamDeviceArray = Kokkos::View<double*, Kokkos::HostSpace>;
#endif
using StreamDeviceArray =
    Kokkos::View<double*, Kokkos::MemoryTraits<Kokkos::Restrict>>;
using StreamHostArray = typename StreamDeviceArray::HostMirror;

using StreamIndex = int;
using Policy = Kokkos::RangePolicy<Kokkos::IndexType<StreamIndex>>;

double now() {
  struct timeval now;
  gettimeofday(&now, nullptr);

  return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6);
}

void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b,
                  StreamDeviceArray& c) {
void perform_set(StreamDeviceArray& a, const double scalar) {
  Kokkos::parallel_for(
      "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; });
      "set", Policy(0, a.extent(0)),
      KOKKOS_LAMBDA(const StreamIndex i) { a[i] = scalar; });

  Kokkos::fence();
}

void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
                   StreamDeviceArray& c, const double scalar) {
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b) {
  Kokkos::parallel_for(
      "copy", a.extent(0),
      "copy", Policy(0, a.extent(0)),
      KOKKOS_LAMBDA(const StreamIndex i) { b[i] = a[i]; });

  Kokkos::fence();
}

void perform_scale(StreamDeviceArray& b, StreamDeviceArray& c,
                   const double scalar) {
  Kokkos::parallel_for(
      "scale", Policy(0, b.extent(0)),
      KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; });

  Kokkos::fence();
@ -89,7 +87,7 @@ void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b,
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
                 StreamDeviceArray& c) {
  Kokkos::parallel_for(
      "add", a.extent(0),
      "add", Policy(0, a.extent(0)),
      KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; });

  Kokkos::fence();
@ -98,7 +96,7 @@ void perform_add(StreamDeviceArray& a, StreamDeviceArray& b,
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b,
                   StreamDeviceArray& c, const double scalar) {
  Kokkos::parallel_for(
      "triad", a.extent(0),
      "triad", Policy(0, a.extent(0)),
      KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; });

  Kokkos::fence();
@ -184,6 +182,7 @@ int run_benchmark() {

  const double scalar = 3.0;

  double setTime = std::numeric_limits<double>::max();
  double copyTime = std::numeric_limits<double>::max();
  double scaleTime = std::numeric_limits<double>::max();
  double addTime = std::numeric_limits<double>::max();
@ -191,13 +190,10 @@ int run_benchmark() {

  printf("Initializing Views...\n");

#if defined(KOKKOS_HAVE_OPENMP)
  Kokkos::parallel_for(
      "init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
#else
  Kokkos::parallel_for(
      "init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
#endif
      "init",
      Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,
                                                             STREAM_ARRAY_SIZE),
      KOKKOS_LAMBDA(const int i) {
        a[i] = 1.0;
        b[i] = 2.0;
@ -209,26 +205,30 @@ int run_benchmark() {
  Kokkos::deep_copy(dev_b, b);
  Kokkos::deep_copy(dev_c, c);

  double start;

  printf("Starting benchmarking...\n");

  Kokkos::Timer timer;

  for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) {
    start = now();
    perform_copy(dev_a, dev_b, dev_c);
    copyTime = std::min(copyTime, (now() - start));
    timer.reset();
    perform_set(dev_c, 1.5);
    setTime = std::min(setTime, timer.seconds());

    start = now();
    perform_scale(dev_a, dev_b, dev_c, scalar);
    scaleTime = std::min(scaleTime, (now() - start));
    timer.reset();
    perform_copy(dev_a, dev_c);
    copyTime = std::min(copyTime, timer.seconds());

    start = now();
    timer.reset();
    perform_scale(dev_b, dev_c, scalar);
    scaleTime = std::min(scaleTime, timer.seconds());

    timer.reset();
    perform_add(dev_a, dev_b, dev_c);
    addTime = std::min(addTime, (now() - start));
    addTime = std::min(addTime, timer.seconds());

    start = now();
    timer.reset();
    perform_triad(dev_a, dev_b, dev_c, scalar);
    triadTime = std::min(triadTime, (now() - start));
    triadTime = std::min(triadTime, timer.seconds());
  }

  Kokkos::deep_copy(a, dev_a);
@ -240,6 +240,9 @@ int run_benchmark() {

  printf(HLINE);

  printf("Set %11.2f MB/s\n",
         (1.0e-06 * 1.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
             setTime);
  printf("Copy %11.2f MB/s\n",
         (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) /
             copyTime);
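The reporting follows the STREAM convention: bytes moved per kernel divided by the fastest iteration, hence the 1.0 factor for Set (one array touched) and 2.0 for Copy (one read plus one write). As an illustrative check with 10^8 doubles, Copy moves 2 x 8 x 10^8 bytes = 1600 MB per iteration, so a best time of 0.02 s reports 80000 MB/s.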

@ -634,15 +634,15 @@ elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
  > ${HPCBIND_OUT}
  if [[ ${HPCBIND_TEE} -eq 0 ]]; then
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
    else
      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
      eval "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
    fi
  else
    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
    else
      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
      eval "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
    fi
  fi
fi
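The quoting fix matters whenever the wrapped command has arguments containing spaces: `"$@"` re-expands each original argument as a single word, while the old unquoted `$@` (and unquoted `eval $@`) let the shell re-split and glob-expand them.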
@ -96,10 +96,10 @@ replace_pragma_ident=0
first_xcompiler_arg=1

# Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop)
if [[ ! -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
if [[ -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then
  temp_dir=${TMPDIR:-/tmp}
else
  temp_dir=${NVCC_WRAPPER_TMPDIR+x}
  temp_dir=${NVCC_WRAPPER_TMPDIR}
fi
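Two bugs are fixed here at once: `${NVCC_WRAPPER_TMPDIR+x}` expands to the sentinel `x` when the variable is set and to nothing when unset, so it is only suitable for existence tests. The old code therefore took the `${TMPDIR:-/tmp}` fallback precisely when `NVCC_WRAPPER_TMPDIR` was set, and assigned an empty `temp_dir` when it was not. The fixed lines invert the test and use the plain `${NVCC_WRAPPER_TMPDIR}` expansion.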

# optimization flag added as a command-line argument
@ -226,14 +226,14 @@ do
      cuda_args="$cuda_args $1"
      ;;
    #Handle more known nvcc args
    --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets)
    --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler)
      cuda_args="$cuda_args $1"
      ;;
    #Handle known nvcc args that have an argument
    -maxrregcount=*|--maxrregcount=*)
    -maxrregcount=*|--maxrregcount=*|-time=*)
      cuda_args="$cuda_args $1"
      ;;
    -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include)
    -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include|-time)
      cuda_args="$cuda_args $1 $2"
      shift
      ;;
@ -552,14 +552,14 @@ if [ $host_only -eq 1 ]; then
  $host_command
elif [ -n "$nvcc_depfile_command" ]; then
  if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
    echo "$nvcc_command && $nvcc_depfile_command"
    echo "TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command"
  fi
  $nvcc_command && $nvcc_depfile_command
  TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command
else
  if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
    echo "$nvcc_command"
    echo "TMPDIR=${temp_dir} $nvcc_command"
  fi
  $nvcc_command
  TMPDIR=${temp_dir} $nvcc_command
fi
error_code=$?

@ -1,91 +0,0 @@
#----------------------------------------------------------------------------------------#
#
#   CTestConfig.cmake template for Kokkos
#
#----------------------------------------------------------------------------------------#

#
#   dash-board related
#
set(CTEST_PROJECT_NAME "Kokkos")
set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC")
set(CTEST_DROP_METHOD "https")
set(CTEST_DROP_SITE "cdash.nersc.gov")
set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}")
set(CTEST_CDASH_VERSION "1.6")
set(CTEST_CDASH_QUERY_VERSION TRUE)
set(CTEST_SUBMIT_RETRY_COUNT "1")
set(CTEST_SUBMIT_RETRY_DELAY "30")

#
#   configure/build related
#
set(CTEST_BUILD_NAME "@BUILD_NAME@")
set(CTEST_MODEL "@MODEL@")
set(CTEST_SITE "@SITE@")
set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@")
set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@")
set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@")

#
#   configure/build related
#
set(CTEST_UPDATE_TYPE "git")
set(CTEST_UPDATE_VERSION_ONLY ON)
# set(CTEST_GENERATOR "")
# set(CTEST_GENERATOR_PLATFORM "")

#
#   testing related
#
set(CTEST_TIMEOUT "7200")
set(CTEST_TEST_TIMEOUT "7200")
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100")
set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100")
set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576")

#
#   coverage related
#
set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*")

#
#   commands
#
if(NOT "@CHECKOUT_COMMAND@" STREQUAL "")
    set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@")
endif()
set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@")
set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@")
set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@")
if(NOT WIN32)
    set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@")
endif()
set(CTEST_COVERAGE_COMMAND "gcov")
set(CTEST_MEMORYCHECK_COMMAND "valgrind")
set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@")

#
#   various configs
#
set(APPEND_VALUE @APPEND@)
if(APPEND_VALUE)
    set(APPEND_CTEST APPEND)
endif()

macro(SET_TEST_PROP VAR)
    if(NOT "${ARGS}" STREQUAL "")
        set(${VAR}_CTEST ${VAR} ${ARGN})
    endif()
endmacro()

set_test_prop(START @START@)
set_test_prop(END @END@)
set_test_prop(STRIDE @STRIDE@)
set_test_prop(INCLUDE @INCLUDE@)
set_test_prop(EXCLUDE @EXCLUDE@)
set_test_prop(INCLUDE_LABEL @INCLUDE_LABEL@)
set_test_prop(EXCLUDE_LABEL @EXCLUDE_LABEL@)
set_test_prop(PARALLEL_LEVEL @PARALLEL_LEVEL@)
set_test_prop(STOP_TIME @STOP_TIME@)
set_test_prop(COVERAGE_LABELS @LABELS@)
@ -1,350 +0,0 @@
|
||||
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
|
||||
|
||||
message(STATUS "")
|
||||
|
||||
get_cmake_property(_cached_vars CACHE_VARIABLES)
|
||||
set(KOKKOS_CMAKE_ARGS)
|
||||
set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT"
|
||||
"CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE")
|
||||
list(SORT _cached_vars)
|
||||
foreach(_var ${_cached_vars})
|
||||
if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES)
|
||||
list(APPEND KOKKOS_CMAKE_ARGS ${_var})
|
||||
if("${_var}" STREQUAL "CMAKE_BUILD_TYPE")
|
||||
set(BUILD_TYPE "${CMAKE_BUILD_TYPE}")
|
||||
endif()
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
|
||||
#----------------------------------------------------------------------------------------#
|
||||
#
|
||||
# Macros and variables
|
||||
#
|
||||
#----------------------------------------------------------------------------------------#
|
||||
|
||||
macro(CHECK_REQUIRED VAR)
|
||||
if(NOT DEFINED ${VAR})
|
||||
message(FATAL_ERROR "Error! Variable '${VAR}' must be defined")
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
# require the build name variable
|
||||
CHECK_REQUIRED(BUILD_NAME)
|
||||
|
||||
# uses all args
|
||||
macro(SET_DEFAULT VAR)
|
||||
if(NOT DEFINED ${VAR})
|
||||
set(${VAR} ${ARGN})
|
||||
endif()
|
||||
# remove these ctest configuration variables from the defines
|
||||
# passed to the Kokkos configuration
|
||||
if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
|
||||
list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
# uses first arg -- useful for selecting via priority from multiple
|
||||
# potentially defined variables, e.g.:
|
||||
#
|
||||
# set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME})
|
||||
#
|
||||
macro(SET_DEFAULT_ARG1 VAR)
|
||||
if(NOT DEFINED ${VAR})
|
||||
foreach(_ARG ${ARGN})
|
||||
if(NOT "${_ARG}" STREQUAL "")
|
||||
set(${VAR} ${_ARG})
|
||||
break()
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
# remove these ctest configuration variables from the defines
|
||||
# passed to the Kokkos configuration
|
||||
if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS)
|
||||
list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}")
|
||||
endif()
|
||||
endmacro()
|
||||
|
||||
# determine the default working directory
|
||||
if(NOT "$ENV{WORKSPACE}" STREQUAL "")
|
||||
set(WORKING_DIR "$ENV{WORKSPACE}")
|
||||
else()
|
||||
get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY)
|
||||
endif()
|
||||
|
||||
# determine the hostname
|
||||
execute_process(COMMAND hostname
|
||||
OUTPUT_VARIABLE HOSTNAME
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||
|
||||
SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}")
|
||||
|
||||
# get the number of processors
|
||||
include(ProcessorCount)
|
||||
ProcessorCount(NUM_PROCESSORS)
|
||||
|
||||
# find git
|
||||
find_package(Git QUIET)
|
||||
if(NOT GIT_EXECUTABLE)
|
||||
unset(GIT_EXECUTABLE CACHE)
|
||||
unset(GIT_EXECUTABLE)
|
||||
endif()
|
||||
|
||||
function(EXECUTE_GIT_COMMAND VAR)
|
||||
set(${VAR} "" PARENT_SCOPE)
|
||||
execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN}
|
||||
OUTPUT_VARIABLE VAL
|
||||
RESULT_VARIABLE RET
|
||||
OUTPUT_STRIP_TRAILING_WHITESPACE
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR}
|
||||
ERROR_QUIET)
|
||||
string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}")
|
||||
set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE)
|
||||
if(RET EQUAL 0)
|
||||
set(${VAR} "${VAL}" PARENT_SCOPE)
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# just gets the git branch name if available
|
||||
function(GET_GIT_BRANCH_NAME VAR)
|
||||
execute_git_command(GIT_BRANCH branch --show-current)
|
||||
set(_INVALID "%D" "HEAD")
|
||||
if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
|
||||
execute_git_command(GIT_BRANCH show -s --format=%D)
|
||||
if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID)
|
||||
execute_git_command(GIT_BRANCH --describe all)
|
||||
endif()
|
||||
endif()
|
||||
#
|
||||
if(GIT_BRANCH)
|
||||
string(REPLACE " " ";" _DESC "${GIT_BRANCH}")
|
||||
# just set it to last one via loop instead of wonky cmake index manip
|
||||
foreach(_ITR ${_DESC})
|
||||
set(GIT_BRANCH "${_ITR}")
|
||||
endforeach()
|
||||
set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE)
|
||||
message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# just gets the git branch name if available
|
||||
function(GET_GIT_AUTHOR_NAME VAR)
|
||||
execute_git_command(GIT_AUTHOR show -s --format=%an)
|
||||
if(GIT_AUTHOR)
|
||||
string(LENGTH "${GIT_AUTHOR}" STRLEN)
|
||||
# if the build name gets too long, this can cause submission errors
|
||||
if(STRLEN GREATER 24)
|
||||
# remove middle initial
|
||||
string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}")
|
||||
# get first and sur name
|
||||
string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}")
|
||||
string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}")
|
||||
if(S_NAME)
|
||||
set(GIT_AUTHOR "${S_NAME}")
|
||||
elseif(F_NAME)
|
||||
set(GIT_AUTHOR "${F_NAME}")
|
||||
endif()
|
||||
endif()
|
||||
# remove any spaces, quotes, periods, etc.
|
||||
string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}")
|
||||
set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE)
|
||||
message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}")
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# get the name of the branch
|
||||
GET_GIT_BRANCH_NAME(GIT_BRANCH)
|
||||
# get the name of the author
|
||||
GET_GIT_AUTHOR_NAME(GIT_AUTHOR)
|
||||
# author, prefer git method for consistency
|
||||
SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR})
|
||||
# SLUG == owner_name/repo_name
|
||||
SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG})
|
||||
# branch name
|
||||
SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH})
|
||||
# pull request number
|
||||
SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM})
|
||||
# get the event type, e.g. push, pull_request, api, cron, etc.
|
||||
SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE})

if("${BRANCH}" STREQUAL "")
  message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'")
  message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=<name>")
endif()

#----------------------------------------------------------------------------------------#
#
#   Set default values if not provided on command-line
#
#----------------------------------------------------------------------------------------#

SET_DEFAULT(SOURCE_DIR "${WORKING_DIR}")             # source directory
SET_DEFAULT(BINARY_DIR "${WORKING_DIR}/build")       # build directory
SET_DEFAULT(BUILD_TYPE "${CMAKE_BUILD_TYPE}")        # Release, Debug, etc.
SET_DEFAULT(MODEL "Continuous")                      # Continuous, Nightly, or Experimental
SET_DEFAULT(JOBS 1)                                  # number of parallel ctests
SET_DEFAULT(CTEST_COMMAND "${CMAKE_CTEST_COMMAND}")  # just in case
SET_DEFAULT(CTEST_ARGS "-V --output-on-failure")     # extra arguments when ctest is called
SET_DEFAULT(GIT_EXECUTABLE "git")                    # ctest_update
SET_DEFAULT(TARGET "all")                            # build target
SET_DEFAULT_ARG1(SITE "$ENV{SITE}"
                 "${HOSTNAME}")                      # update site
SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}"
                 "${NUM_PROCESSORS}")                # number of parallel compile jobs
#
#   The variables below correspond to ctest arguments, i.e. START,END,STRIDE are
#   '-I START,END,STRIDE'
#
SET_DEFAULT(START "")
SET_DEFAULT(END "")
SET_DEFAULT(STRIDE "")
SET_DEFAULT(INCLUDE "")
SET_DEFAULT(EXCLUDE "")
SET_DEFAULT(INCLUDE_LABEL "")
SET_DEFAULT(EXCLUDE_LABEL "")
SET_DEFAULT(PARALLEL_LEVEL "")
SET_DEFAULT(STOP_TIME "")
SET_DEFAULT(LABELS "")
SET_DEFAULT(NOTES "")
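# Illustrative sketch (assumed wiring, not shown in this excerpt): the dashboard
# script is expected to turn the START/END/STRIDE defaults above into the
# matching keyword arguments of ctest_test(), e.g.:
#
#   if(NOT "${START}" STREQUAL "")
#     set(START_CTEST START ${START})      # expands to 'START <n>' in ctest_test()
#   endif()
#   if(NOT "${STRIDE}" STREQUAL "")
#     set(STRIDE_CTEST STRIDE ${STRIDE})
#   endif()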

# default static build tag for Nightly
set(BUILD_TAG "${BRANCH}")

if(NOT BUILD_TYPE)
  # default for kokkos if not specified
  set(BUILD_TYPE "RelWithDebInfo")
endif()

# generate dynamic name if continuous or experimental model
if(NOT "${MODEL}" STREQUAL "Nightly")
  if(EVENT_TYPE AND PULL_REQUEST_NUM)
    # e.g. pull_request/123
    if(AUTHOR)
      set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}")
    else()
      set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}")
    endif()
  elseif(SLUG)
    # e.g. owner_name/repo_name
    set(BUILD_TAG "${SLUG}")
  elseif(AUTHOR)
    set(BUILD_TAG "${AUTHOR}/${BRANCH}")
  endif()
  if(EVENT_TYPE AND NOT PULL_REQUEST_NUM)
    set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}")
  endif()
endif()

# strip unnecessary remote prefixes
string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}")
string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}")

message(STATUS "BUILD_TAG: ${BUILD_TAG}")

set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]")

# colons in build name create extra (empty) entries in CDash
string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}")
# drop unnecessary info
string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}")
# consistency
string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}")
string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}")
# clean up artifacts left by missing fields
string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}")
string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}")
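# For example (hypothetical values, for illustration only): a Travis
# pull-request build by author "jdoe" with EVENT_TYPE=pull_request and
# PULL_REQUEST_NUM=123 would start out as
#   "[jdoe/pull_request/123] [gcc-9-Release]"
# and, after the replacements above, be submitted to CDash as
#   "[jdoe/pull/123] [gcc-9-Release]"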

# check binary directory
if(EXISTS ${BINARY_DIR})
  if(NOT IS_DIRECTORY "${BINARY_DIR}")
    message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!")
  endif()
  file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*")
  if(NOT "${BINARY_DIR_FILES}" STREQUAL "")
    message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not empty!")
  endif()
endif()

get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH)
get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH)

#----------------------------------------------------------------------------------------#
#
#   Generate the CTestConfig.cmake
#
#----------------------------------------------------------------------------------------#

set(CONFIG_ARGS)
foreach(_ARG ${KOKKOS_CMAKE_ARGS})
  if(NOT "${${_ARG}}" STREQUAL "")
    get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE)
    if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED")
      if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF")
        set(_ARG_TYPE "BOOL")
      elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}")
        set(_ARG_TYPE "FILEPATH")
      elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}")
        set(_ARG_TYPE "PATH")
      elseif(NOT "${${_ARG}}" STREQUAL "")
        set(_ARG_TYPE "STRING")
      endif()
    endif()
    set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n")
  endif()
endforeach()

file(WRITE ${BINARY_REALDIR}/initial-cache.cmake
"
set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\")
${CONFIG_ARGS}
")

file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO)
message(STATUS "Initial cache:\n${_CACHE_INFO}")

# initialize the cache
set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake")
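# The '-C' flag pre-loads the generated file into the CMake cache before the
# project is configured. A minimal sketch of the equivalent manual invocation
# (paths assumed for illustration):
#
#   cmake -C build/initial-cache.cmake -S <source-dir> -B build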

# generate the CTestConfig.cmake
configure_file(
  ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in
  ${BINARY_REALDIR}/CTestConfig.cmake
  @ONLY)

# copy/generate the dashboard script
configure_file(
  ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in
  ${BINARY_REALDIR}/KokkosCTest.cmake
  @ONLY)

# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake
execute_process(
  COMMAND ${CMAKE_COMMAND} -E touch CTestCustom.cmake
  WORKING_DIRECTORY ${BINARY_REALDIR}
)
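# A CTestCustom.cmake placed there might, for example, raise output limits or
# skip known-noisy tests (illustrative settings, not defaults of this script):
#
#   set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE 1048576)
#   set(CTEST_CUSTOM_TESTS_IGNORE "some_flaky_test")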

#----------------------------------------------------------------------------------------#
#
#   Execute CTest
#
#----------------------------------------------------------------------------------------#

message(STATUS "")
message(STATUS "BUILD_NAME: ${BUILD_NAME}")
message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...")
message(STATUS "")

# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV"
string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}")
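# CMake lists are semicolon-separated strings, so after this replacement each
# token is forwarded to execute_process() as its own argument. For example,
# "-V --output-on-failure" becomes the two-element list "-V;--output-on-failure":
#
#   set(_DEMO "-V --output-on-failure")
#   string(REPLACE " " ";" _DEMO "${_DEMO}")
#   list(LENGTH _DEMO _LEN)   # _LEN is now 2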

execute_process(
  COMMAND ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}
  RESULT_VARIABLE RET
  WORKING_DIRECTORY ${BINARY_REALDIR}
)

# ensure that any non-zero result variable gets propagated
if(NOT RET EQUAL 0)
  message(FATAL_ERROR "CTest returned non-zero exit code: ${RET}")
endif()

@ -1,261 +0,0 @@
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)

if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
  include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake")
endif()

include(ProcessorCount)
ProcessorCount(CTEST_PROCESSOR_COUNT)

cmake_policy(SET CMP0009 NEW)
cmake_policy(SET CMP0011 NEW)

# ---------------------------------------------------------------------------- #
# -- Commands
# ---------------------------------------------------------------------------- #
find_program(CTEST_CMAKE_COMMAND NAMES cmake)
find_program(CTEST_UNAME_COMMAND NAMES uname)

find_program(CTEST_BZR_COMMAND NAMES bzr)
find_program(CTEST_CVS_COMMAND NAMES cvs)
find_program(CTEST_GIT_COMMAND NAMES git)
find_program(CTEST_HG_COMMAND NAMES hg)
find_program(CTEST_P4_COMMAND NAMES p4)
find_program(CTEST_SVN_COMMAND NAMES svn)

find_program(VALGRIND_COMMAND NAMES valgrind)
find_program(GCOV_COMMAND NAMES gcov)
find_program(LCOV_COMMAND NAMES llvm-cov)
find_program(MEMORYCHECK_COMMAND NAMES valgrind)

set(MEMORYCHECK_TYPE Valgrind)
# set(MEMORYCHECK_TYPE Purify)
# set(MEMORYCHECK_TYPE BoundsChecker)
# set(MEMORYCHECK_TYPE ThreadSanitizer)
# set(MEMORYCHECK_TYPE AddressSanitizer)
# set(MEMORYCHECK_TYPE LeakSanitizer)
# set(MEMORYCHECK_TYPE MemorySanitizer)
# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer)
set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full")

# ---------------------------------------------------------------------------- #
# -- Settings
# ---------------------------------------------------------------------------- #
## -- Process timeout in seconds
set(CTEST_TIMEOUT "7200")
## -- Set output to English
set(ENV{LC_MESSAGES} "en_EN")

# ---------------------------------------------------------------------------- #
# -- Copy ctest configuration file
# ---------------------------------------------------------------------------- #
macro(COPY_CTEST_CONFIG_FILES)

  foreach(_FILE CTestConfig.cmake CTestCustom.cmake)

    # if the current directory is neither the binary nor the source directory
    if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND
       NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")

      # if the file exists in the current directory
      if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE})
        configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE}
                       ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
      endif()

    # if source and binary directories differ
    elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}")

      # if the file exists in the source directory but not in the binary directory
      if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND
         NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE})
        configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE}
                       ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY)
      endif()

    endif()
  endforeach()

endmacro()

ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}")

message(STATUS "CTEST_MODEL: ${CTEST_MODEL}")
#-------------------------------------------------------------------------#
# Start
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...")
message(STATUS "")

ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST}
            ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY})

#-------------------------------------------------------------------------#
# Config
#
copy_ctest_config_files()
ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}")

#-------------------------------------------------------------------------#
# Update
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...")
message(STATUS "")

ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}"
             RETURN_VALUE up_ret)

#-------------------------------------------------------------------------#
# Configure
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...")
message(STATUS "")

ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}"
                SOURCE ${CTEST_SOURCE_DIRECTORY}
                ${APPEND_CTEST}
                OPTIONS "${CTEST_CONFIGURE_OPTIONS}"
                RETURN_VALUE config_ret)

#-------------------------------------------------------------------------#
# Echo the configure log because Damien wants to delay merging this PR for eternity
#
file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log")
# there should only be one, but loop just for safety
foreach(_LOG ${_configure_log})
  file(READ ${_LOG} _LOG_MESSAGE)
  message(STATUS "Configure Log: ${_LOG}")
  message(STATUS "\n${_LOG_MESSAGE}\n")
endforeach()
#-------------------------------------------------------------------------#
# Build
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...")
message(STATUS "")

ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}"
            ${APPEND_CTEST}
            RETURN_VALUE build_ret)

#-------------------------------------------------------------------------#
# Echo the build log because Damien wants to delay merging this PR for eternity
#
file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log")
# there should only be one, but loop just for safety
foreach(_LOG ${_build_log})
  file(READ ${_LOG} _LOG_MESSAGE)
  message(STATUS "Build Log: ${_LOG}")
  message(STATUS "\n${_LOG_MESSAGE}\n")
endforeach()

#-------------------------------------------------------------------------#
# Test
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...")
message(STATUS "")

ctest_test(RETURN_VALUE test_ret
           ${APPEND_CTEST}
           ${START_CTEST}
           ${END_CTEST}
           ${STRIDE_CTEST}
           ${INCLUDE_CTEST}
           ${EXCLUDE_CTEST}
           ${INCLUDE_LABEL_CTEST}
           ${EXCLUDE_LABEL_CTEST}
           ${PARALLEL_LEVEL_CTEST}
           ${STOP_TIME_CTEST}
           SCHEDULE_RANDOM OFF)

#-------------------------------------------------------------------------#
# Coverage
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...")
message(STATUS "")

execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS}
                WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY}
                ERROR_QUIET)

ctest_coverage(${APPEND_CTEST}
               ${CTEST_COVERAGE_LABELS}
               RETURN_VALUE cov_ret)

#-------------------------------------------------------------------------#
# MemCheck
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...")
message(STATUS "")

ctest_memcheck(RETURN_VALUE mem_ret
               ${APPEND_CTEST}
               ${START_CTEST}
               ${END_CTEST}
               ${STRIDE_CTEST}
               ${INCLUDE_CTEST}
               ${EXCLUDE_CTEST}
               ${INCLUDE_LABEL_CTEST}
               ${EXCLUDE_LABEL_CTEST}
               ${PARALLEL_LEVEL_CTEST})

#-------------------------------------------------------------------------#
# Submit
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...")
message(STATUS "")

file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake")
foreach(_FILE ${NOTE_FILES})
  message(STATUS "Including CTest notes files: \"${_FILE}\"...")
  include("${_FILE}")
endforeach()

# capture the submit return code so the script doesn't fail due to a submission error
ctest_submit(RETURN_VALUE submit_ret
             RETRY_COUNT 2
             RETRY_DELAY 10
             CAPTURE_CMAKE_ERROR submit_err)

#-------------------------------------------------------------------------#
# Finished
#
message(STATUS "")
message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})")
message(STATUS "")

#-------------------------------------------------------------------------#
# Non-zero exit codes for important errors
#
if(NOT config_ret EQUAL 0)
  message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}")
endif()

if(NOT build_ret EQUAL 0)
  message(FATAL_ERROR "Error during build! Exit code: ${build_ret}")
endif()

if(NOT test_ret EQUAL 0)
  message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}")
endif()
@ -41,6 +41,7 @@
#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA
#cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH
#cmakedefine KOKKOS_ENABLE_DEBUG
@ -49,17 +50,21 @@
#cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS
#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
#cmakedefine KOKKOS_ENABLE_TUNING
#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE
#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3
#cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS
#cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS
#cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK
#cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN
#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated
#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION

/* TPL Settings */
#cmakedefine KOKKOS_ENABLE_HWLOC
#cmakedefine KOKKOS_USE_LIBRT
#cmakedefine KOKKOS_ENABLE_HBWSPACE
#cmakedefine KOKKOS_ENABLE_LIBDL
#cmakedefine KOKKOS_ENABLE_LIBQUADMATH
#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND

#cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@
@ -79,6 +84,12 @@
#cmakedefine KOKKOS_ARCH_POWER8
#cmakedefine KOKKOS_ARCH_POWER9
#cmakedefine KOKKOS_ARCH_INTEL_GEN
#cmakedefine KOKKOS_ARCH_INTEL_DG1
#cmakedefine KOKKOS_ARCH_INTEL_GEN9
#cmakedefine KOKKOS_ARCH_INTEL_GEN11
#cmakedefine KOKKOS_ARCH_INTEL_GEN12LP
#cmakedefine KOKKOS_ARCH_INTEL_XEHP
#cmakedefine KOKKOS_ARCH_INTEL_GPU
#cmakedefine KOKKOS_ARCH_KEPLER
#cmakedefine KOKKOS_ARCH_KEPLER30
#cmakedefine KOKKOS_ARCH_KEPLER32
@ -95,6 +106,7 @@
#cmakedefine KOKKOS_ARCH_VOLTA70
#cmakedefine KOKKOS_ARCH_VOLTA72
#cmakedefine KOKKOS_ARCH_TURING75
#cmakedefine KOKKOS_ARCH_AMPERE
#cmakedefine KOKKOS_ARCH_AMPERE80
#cmakedefine KOKKOS_ARCH_AMPERE86
#cmakedefine KOKKOS_ARCH_AMD_ZEN

@ -29,7 +29,12 @@ ELSE()
ENDIF()

include(FindPackageHandleStandardArgs)
FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER)
IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI)
  SET(KOKKOS_CUDA_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1")
ELSE()
  SET(KOKKOS_CUDA_ERROR DEFAULT_MSG)
ENDIF()
FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER)
IF (FOUND_CUDA_DRIVER AND FOUND_CUDART)
  KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE
    LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart

lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake (new file)
@ -0,0 +1 @@
KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath)
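# Illustrative usage (option name taken from the TPL option added elsewhere in
# this release): configuring Kokkos with quad-precision math support would
# look like
#
#   cmake -DKokkos_ENABLE_LIBQUADMATH=ON ...
#
# which makes this module locate quadmath.h and libquadmath.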

lib/kokkos/cmake/deps/quadmath.cmake (new file)
@ -0,0 +1,46 @@
# @HEADER
# ************************************************************************
#
# Kokkos v. 3.0
# Copyright (2020) National Technology & Engineering
# Solutions of Sandia, LLC (NTESS).
#
# Under the terms of Contract DE-NA0003525 with NTESS,
# the U.S. Government retains certain rights in this software.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
#
# ************************************************************************
# @HEADER

KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
  REQUIRED_HEADERS quadmath.h
  REQUIRED_LIBS_NAMES quadmath
)
@ -67,8 +67,13 @@ KOKKOS_ARCH_OPTION(ZEN3 HOST "AMD Zen3 architecture")
KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900")
KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906")
KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908")
KOKKOS_ARCH_OPTION(VEGA90A GPU "AMD GPU")
KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+")

KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU")
KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9")
KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11")
KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP")
KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP")

IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
@ -76,6 +81,12 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)
    "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic"
    "-Wsign-compare" "-Wtype-limits" "-Wuninitialized")

  # NOTE: the KOKKOS_-prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH
  IF(Kokkos_ENABLE_LIBQUADMATH)
    # warning: non-standard suffix on floating constant [-Wpedantic]
    LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic")
  ENDIF()

  # OpenMPTarget compilers give erroneous warnings about sign comparison in loops
  IF(KOKKOS_ENABLE_OPENMPTARGET)
    LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare")
@ -86,7 +97,7 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS)

  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID CMAKE_CXX_COMPILER_ID
    PGI     NO-VALUE-SPECIFIED
    NVHPC   NO-VALUE-SPECIFIED
    GNU     ${GNU_WARNINGS}
    DEFAULT ${COMMON_WARNINGS}
  )
@ -158,16 +169,18 @@ ENDIF()

IF (KOKKOS_ARCH_ARMV80)
  COMPILER_SPECIFIC_FLAGS(
    Cray    NO-VALUE-SPECIFIED
    PGI     NO-VALUE-SPECIFIED
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Cray    NO-VALUE-SPECIFIED
    NVHPC   NO-VALUE-SPECIFIED
    DEFAULT -march=armv8-a
  )
ENDIF()

IF (KOKKOS_ARCH_ARMV81)
  COMPILER_SPECIFIC_FLAGS(
    Cray    NO-VALUE-SPECIFIED
    PGI     NO-VALUE-SPECIFIED
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Cray    NO-VALUE-SPECIFIED
    NVHPC   NO-VALUE-SPECIFIED
    DEFAULT -march=armv8.1-a
  )
ENDIF()
@ -175,8 +188,9 @@ ENDIF()
IF (KOKKOS_ARCH_ARMV8_THUNDERX)
  SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable
  COMPILER_SPECIFIC_FLAGS(
    Cray    NO-VALUE-SPECIFIED
    PGI     NO-VALUE-SPECIFIED
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Cray    NO-VALUE-SPECIFIED
    NVHPC   NO-VALUE-SPECIFIED
    DEFAULT -march=armv8-a -mtune=thunderx
  )
ENDIF()
@ -184,23 +198,28 @@ ENDIF()
IF (KOKKOS_ARCH_ARMV8_THUNDERX2)
  SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable
  COMPILER_SPECIFIC_FLAGS(
    Cray    NO-VALUE-SPECIFIED
    PGI     NO-VALUE-SPECIFIED
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Cray    NO-VALUE-SPECIFIED
    NVHPC   NO-VALUE-SPECIFIED
    DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99
  )
ENDIF()

IF (KOKKOS_ARCH_A64FX)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    NVHPC   NO-VALUE-SPECIFIED
    DEFAULT -march=armv8.2-a+sve
    Clang   -march=armv8.2-a+sve -msve-vector-bits=512
    GCC     -march=armv8.2-a+sve -msve-vector-bits=512
    Clang   -march=armv8.2-a+sve -msve-vector-bits=512
    GCC     -march=armv8.2-a+sve -msve-vector-bits=512
  )
ENDIF()

IF (KOKKOS_ARCH_ZEN)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -mavx2
    NVHPC   -tp=zen
    DEFAULT -march=znver1 -mtune=znver1
  )
  SET(KOKKOS_ARCH_AMD_ZEN ON)
@ -209,7 +228,9 @@ ENDIF()

IF (KOKKOS_ARCH_ZEN2)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -mavx2
    NVHPC   -tp=zen2
    DEFAULT -march=znver2 -mtune=znver2
  )
  SET(KOKKOS_ARCH_AMD_ZEN2 ON)
@ -218,7 +239,9 @@ ENDIF()

IF (KOKKOS_ARCH_ZEN3)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -mavx2
    NVHPC   -tp=zen2
    DEFAULT -march=znver3 -mtune=znver3
  )
  SET(KOKKOS_ARCH_AMD_ZEN3 ON)
@ -227,8 +250,9 @@ ENDIF()

IF (KOKKOS_ARCH_WSM)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -xSSE4.2
    PGI     -tp=nehalem
    NVHPC   -tp=px
    Cray    NO-VALUE-SPECIFIED
    DEFAULT -msse4.2
  )
@ -238,8 +262,9 @@ ENDIF()
IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX)
  SET(KOKKOS_ARCH_AVX ON)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -mavx
    PGI     -tp=sandybridge
    NVHPC   -tp=sandybridge
    Cray    NO-VALUE-SPECIFIED
    DEFAULT -mavx
  )
@ -248,8 +273,9 @@ ENDIF()
IF (KOKKOS_ARCH_HSW)
  SET(KOKKOS_ARCH_AVX2 ON)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -xCORE-AVX2
    PGI     -tp=haswell
    NVHPC   -tp=haswell
    Cray    NO-VALUE-SPECIFIED
    DEFAULT -march=core-avx2 -mtune=core-avx2
  )
@ -258,8 +284,9 @@ ENDIF()
IF (KOKKOS_ARCH_BDW)
  SET(KOKKOS_ARCH_AVX2 ON)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -xCORE-AVX2
    PGI     -tp=haswell
    NVHPC   -tp=haswell
    Cray    NO-VALUE-SPECIFIED
    DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm
  )
@ -269,8 +296,9 @@ IF (KOKKOS_ARCH_KNL)
  #avx512-mic
  SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -xMIC-AVX512
    PGI     NO-VALUE-SPECIFIED
    NVHPC   -tp=knl
    Cray    NO-VALUE-SPECIFIED
    DEFAULT -march=knl -mtune=knl
  )
@ -279,6 +307,7 @@ ENDIF()
IF (KOKKOS_ARCH_KNC)
  SET(KOKKOS_USE_ISA_KNC ON)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    DEFAULT -mmic
  )
ENDIF()
@ -287,8 +316,9 @@ IF (KOKKOS_ARCH_SKX)
  #avx512-xeon
  SET(KOKKOS_ARCH_AVX512XEON ON)
  COMPILER_SPECIFIC_FLAGS(
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    Intel   -xCORE-AVX512
    PGI     NO-VALUE-SPECIFIED
    NVHPC   -tp=skylake
    Cray    NO-VALUE-SPECIFIED
    DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm
  )
@ -304,7 +334,8 @@ ENDIF()

IF (KOKKOS_ARCH_POWER7)
  COMPILER_SPECIFIC_FLAGS(
    PGI     NO-VALUE-SPECIFIED
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    NVHPC   NO-VALUE-SPECIFIED
    DEFAULT -mcpu=power7 -mtune=power7
  )
  SET(KOKKOS_USE_ISA_POWERPCBE ON)
@ -312,16 +343,16 @@ ENDIF()

IF (KOKKOS_ARCH_POWER8)
  COMPILER_SPECIFIC_FLAGS(
    PGI     NO-VALUE-SPECIFIED
    NVIDIA  NO-VALUE-SPECIFIED
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    NVHPC   -tp=pwr8
    DEFAULT -mcpu=power8 -mtune=power8
  )
ENDIF()

IF (KOKKOS_ARCH_POWER9)
  COMPILER_SPECIFIC_FLAGS(
    PGI     NO-VALUE-SPECIFIED
    NVIDIA  NO-VALUE-SPECIFIED
    COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
    NVHPC   -tp=pwr9
    DEFAULT -mcpu=power9 -mtune=power9
  )
ENDIF()
@ -368,7 +399,7 @@ ENDIF()

IF (KOKKOS_ENABLE_SYCL)
  COMPILER_SPECIFIC_FLAGS(
    DEFAULT -fsycl
    DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int
  )
  COMPILER_SPECIFIC_OPTIONS(
    DEFAULT -fsycl-unnamed-lambda
@ -443,20 +474,58 @@ ENDFUNCTION()
CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25
CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60
CHECK_AMDGPU_ARCH(VEGA908 gfx908)
CHECK_AMDGPU_ARCH(VEGA90A gfx90a)

IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED)
  MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
                     "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
  IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
    FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator)
    EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS)
    STRING(LENGTH "${GPU_ARCHS}" len_str)
    # the enumerator always outputs gfx000 as the first line
    IF(${len_str} LESS 8)
      MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
                         "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
    ENDIF()
  ELSE()
    MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. "
                       "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.")
  ENDIF()
ENDIF()
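# For reference (sample output, assumed for illustration): rocm_agent_enumerator
# prints one gfx target per line, with gfx000 standing in for the host CPU, e.g.
#
#   gfx000
#   gfx90a
#
# so output shorter than 8 characters means no actual GPU agent was detected.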

MACRO(CHECK_MULTIPLE_INTEL_ARCH)
  IF(KOKKOS_ARCH_INTEL_GPU)
    MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!")
  ENDIF()
  SET(KOKKOS_ARCH_INTEL_GPU ON)
ENDMACRO()

IF(KOKKOS_ARCH_INTEL_GEN)
  CHECK_MULTIPLE_INTEL_ARCH()
ENDIF()
IF(KOKKOS_ARCH_INTEL_DG1)
  CHECK_MULTIPLE_INTEL_ARCH()
ENDIF()
IF(KOKKOS_ARCH_INTEL_GEN9)
  CHECK_MULTIPLE_INTEL_ARCH()
ENDIF()
IF(KOKKOS_ARCH_INTEL_GEN11)
  CHECK_MULTIPLE_INTEL_ARCH()
ENDIF()
IF(KOKKOS_ARCH_INTEL_GEN12LP)
  CHECK_MULTIPLE_INTEL_ARCH()
ENDIF()
IF(KOKKOS_ARCH_INTEL_XEHP)
  CHECK_MULTIPLE_INTEL_ARCH()
ENDIF()
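# Illustrative configure line (architecture chosen only as an example):
# selecting exactly one Intel GPU architecture is done via
#
#   cmake -DKokkos_ENABLE_SYCL=ON -DKokkos_ARCH_INTEL_GEN9=ON ...
#
# while enabling a second Kokkos_ARCH_INTEL_* option trips the
# CHECK_MULTIPLE_INTEL_ARCH() guard above.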

IF (KOKKOS_ENABLE_OPENMPTARGET)
  SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG})
  IF (CLANG_CUDA_ARCH)
    STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH})
    STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH})
    COMPILER_SPECIFIC_FLAGS(
      Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda
      XL    -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
      PGI   -gpu=${PGI_CUDA_ARCH}
      XL    -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG}
      NVHPC -gpu=${NVHPC_CUDA_ARCH}
    )
  ENDIF()
  SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG})
@ -465,7 +534,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
      Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa
    )
  ENDIF()
  IF (KOKKOS_ARCH_INTEL_GEN)
  IF (KOKKOS_ARCH_INTEL_GPU)
    COMPILER_SPECIFIC_FLAGS(
      IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__
    )
@ -485,7 +554,27 @@ IF (KOKKOS_ENABLE_SYCL)
  ENDIF()
  ELSEIF(KOKKOS_ARCH_INTEL_GEN)
    COMPILER_SPECIFIC_FLAGS(
      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl"
      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-"
    )
  ELSEIF(KOKKOS_ARCH_INTEL_GEN9)
    COMPILER_SPECIFIC_FLAGS(
      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"
    )
  ELSEIF(KOKKOS_ARCH_INTEL_GEN11)
    COMPILER_SPECIFIC_FLAGS(
      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11"
    )
  ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP)
    COMPILER_SPECIFIC_FLAGS(
      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp"
    )
  ELSEIF(KOKKOS_ARCH_INTEL_DG1)
    COMPILER_SPECIFIC_FLAGS(
      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1"
    )
  ELSEIF(KOKKOS_ARCH_INTEL_XEHP)
    COMPILER_SPECIFIC_FLAGS(
      DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp"
    )
  ENDIF()
ENDIF()
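# As a sketch of the net effect (flags exactly as assembled above): configuring
# for INTEL_GEN9 with SYCL enabled should yield ahead-of-time compilation flags
# along the lines of
#
#   -fsycl -fno-sycl-id-queries-fit-in-int \
#   -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9"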

@ -137,7 +137,7 @@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 4.0.0 or higher"
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 5.3.0 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 17.0.0 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.2.88 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.8.0 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 4.2.0 or higher")
SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.4 or higher\n")

IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang)
@ -158,13 +158,23 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
  ENDIF()
  SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE)
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0)
  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.2.0)
    MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
  ENDIF()
ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
  IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4)
    MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
  ENDIF()
  # Treat PGI internally as NVHPC to simplify handling both compilers.
  # Before CMake 3.20, NVHPC was identified as PGI; nvc++ is
  # backward-compatible with pgc++.
  SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
ENDIF()

IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID)
  SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID})
ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI)
  SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE)
ENDIF()

STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION})

@ -62,7 +62,7 @@ IF(KOKKOS_ENABLE_OPENMP)
  COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID
  Clang      -Xcompiler ${ClangOpenMPFlag}
  IntelLLVM  -Xcompiler -fiopenmp
  PGI        -Xcompiler -mp
  NVHPC      -Xcompiler -mp
  Cray       NO-VALUE-SPECIFIED
  XL         -Xcompiler -qsmp=omp
  DEFAULT    -Xcompiler -fopenmp
@ -72,7 +72,7 @@ IF(KOKKOS_ENABLE_OPENMP)
  Clang      ${ClangOpenMPFlag}
  IntelLLVM  -fiopenmp
  AppleClang -Xpreprocessor -fopenmp
  PGI        -mp
  NVHPC      -mp
  Cray       NO-VALUE-SPECIFIED
  XL         -qsmp=omp
  DEFAULT    -fopenmp
@ -94,7 +94,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET)
  Clang      ${ClangOpenMPFlag} -Wno-openmp-mapping
  IntelLLVM  -fiopenmp -Wno-openmp-mapping
  XL         -qsmp=omp -qoffload -qnoeh
  PGI        -mp=gpu
  NVHPC      -mp=gpu
  DEFAULT    -fopenmp
)
COMPILER_SPECIFIC_DEFS(

@ -26,9 +26,16 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID)
# Put a check in just in case people are using this option
KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE)

# Set the default for Desul atomics usage.
set(_DESUL_ATOMICS_DEFAULT ON)

KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA")
KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default")
KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics")
# As of 08/12/2021 CudaMallocAsync causes issues if UCX is used as the MPI communication layer.
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 ON "Whether code deprecated in major release 3 is available")
KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings")
KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP")
KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH OFF "Whether HPX supports asynchronous dispatch")
KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests")
@ -50,6 +57,9 @@ KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tu
KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops")
KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler")

# This option will go away eventually, but allows fallback to the old implementation when needed.
KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS ON "Whether to use desul based atomics - option only during beta")
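# Since this is a beta opt-out, builds that hit problems with the new Desul
# atomics can fall back to the previous implementation (illustrative configure
# line using the option defined directly above):
#
#   cmake -DKokkos_ENABLE_IMPL_DESUL_ATOMICS=OFF ...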

IF (KOKKOS_ENABLE_CUDA)
  SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}")
ENDIF()

@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET)
ENDFUNCTION()

FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER)
  SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
  SET(COMPILERS NVIDIA NVHPC XL XLClang DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu)
  CMAKE_PARSE_ARGUMENTS(
    PARSE
    "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES"

@ -140,7 +140,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE)
  IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray)
    INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake)
    kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI)
  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
    INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake)
    kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD})
  ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel)

@ -67,6 +67,12 @@ SET(PTHREAD_DEFAULT OFF)
ENDIF()
KOKKOS_TPL_OPTION(PTHREAD ${PTHREAD_DEFAULT} TRIBITS Pthread)

IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath)
  SET(LIBQUADMATH_DEFAULT ON)
ELSE()
  SET(LIBQUADMATH_DEFAULT OFF)
ENDIF()
KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath)

#Make sure we use our local FindKokkosCuda.cmake
KOKKOS_IMPORT_TPL(HPX INTERFACE)
@ -78,6 +84,7 @@ KOKKOS_IMPORT_TPL(LIBDL)
KOKKOS_IMPORT_TPL(MEMKIND)
KOKKOS_IMPORT_TPL(PTHREAD INTERFACE)
KOKKOS_IMPORT_TPL(ROCM INTERFACE)
KOKKOS_IMPORT_TPL(LIBQUADMATH)

#Convert list to newlines (which CMake doesn't always like in cache variables)
STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}")

lib/kokkos/cmake/tpls/FindTPLquadmath.cmake (new file)
@ -0,0 +1,46 @@
# @HEADER
# ************************************************************************
#
# Kokkos v. 3.0
# Copyright (2020) National Technology & Engineering
# Solutions of Sandia, LLC (NTESS).
#
# Under the terms of Contract DE-NA0003525 with NTESS,
# the U.S. Government retains certain rights in this software.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# Questions? Contact Christian R. Trott (crtrott@sandia.gov)
#
# ************************************************************************
# @HEADER

TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath
  REQUIRED_HEADERS quadmath.h
  REQUIRED_LIBS_NAMES quadmath
)
@ -48,7 +48,7 @@
#include <Kokkos_DynRankView.hpp>
#include <vector>

#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

// Compare performance of DynRankView to View, specific focus on the parenthesis
// operators

@ -48,7 +48,7 @@
#include <vector>
#include <algorithm>

#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

// This test will simulate global ids

@ -46,7 +46,7 @@
#define KOKKOS_TEST_SCATTER_VIEW_HPP

#include <Kokkos_ScatterView.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

namespace Perf {

@ -43,7 +43,7 @@
#ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
#define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP

#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

#include <iostream>
#include <iomanip>

@ -76,20 +76,25 @@ class Bitset {
  using execution_space = Device;
  using size_type       = unsigned int;

  enum { BIT_SCAN_REVERSE = 1u };
  enum { MOVE_HINT_BACKWARD = 2u };
  static constexpr unsigned BIT_SCAN_REVERSE   = 1u;
  static constexpr unsigned MOVE_HINT_BACKWARD = 2u;

  enum {
    BIT_SCAN_FORWARD_MOVE_HINT_FORWARD  = 0u,
    BIT_SCAN_REVERSE_MOVE_HINT_FORWARD  = BIT_SCAN_REVERSE,
    BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD,
    BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
  };
  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u;
  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_FORWARD =
      BIT_SCAN_REVERSE;
  static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD =
      MOVE_HINT_BACKWARD;
  static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD =
      BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD;

 private:
  enum { block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT) };
  enum { block_mask = block_size - 1u };
  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
  enum : unsigned {
    block_size = static_cast<unsigned>(sizeof(unsigned) * CHAR_BIT)
  };
  enum : unsigned { block_mask = block_size - 1u };
  enum : unsigned {
    block_shift = Kokkos::Impl::integral_power_of_two(block_size)
  };

 public:
  /// constructor
@ -317,14 +322,18 @@ class ConstBitset {
  enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };

 public:
  KOKKOS_FUNCTION
  ConstBitset() : m_size(0) {}

  KOKKOS_FUNCTION
  ConstBitset(Bitset<Device> const& rhs)
      : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}

  KOKKOS_FUNCTION
  ConstBitset(ConstBitset<Device> const& rhs)
      : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {}

  KOKKOS_FUNCTION
  ConstBitset<Device>& operator=(Bitset<Device> const& rhs) {
    this->m_size   = rhs.m_size;
    this->m_blocks = rhs.m_blocks;
@ -332,6 +341,7 @@ class ConstBitset {
    return *this;
  }

  KOKKOS_FUNCTION
  ConstBitset<Device>& operator=(ConstBitset<Device> const& rhs) {
    this->m_size   = rhs.m_size;
    this->m_blocks = rhs.m_blocks;

@ -597,8 +597,10 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
    }
    if (std::is_same<typename t_host::memory_space,
                     typename t_dev::memory_space>::value) {
      typename t_dev::execution_space().fence();
      typename t_host::execution_space().fence();
      typename t_dev::execution_space().fence(
          "Kokkos::DualView<>::sync: fence after syncing DualView");
      typename t_host::execution_space().fence(
          "Kokkos::DualView<>::sync: fence after syncing DualView");
    }
  }

@ -776,10 +778,11 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
  /// If \c Device is the same as this DualView's device type, then
  /// mark the device's data as modified. Otherwise, mark the host's
  /// data as modified.
  template <class Device>
  template <class Device, class Dummy = DualView,
            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
                nullptr>
  void modify() {
    if (modified_flags.data() == nullptr) return;
    if (impl_dualview_is_single_device::value) return;
    int dev = get_device_side<Device>();

    if (dev == 1) {  // if Device is the same as DualView's device type
@ -811,8 +814,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
#endif
  }

  template <
      class Device, class Dummy = DualView,
      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
  void modify() {
    return;
  }

  template <class Dummy = DualView,
            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
                nullptr>
  inline void modify_host() {
    if (impl_dualview_is_single_device::value) return;
    if (modified_flags.data() != nullptr) {
      modified_flags(0) =
          (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@ -832,8 +844,17 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
    }
  }

  template <
      class Dummy = DualView,
      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
  inline void modify_host() {
    return;
  }

  template <class Dummy = DualView,
            std::enable_if_t<!Dummy::impl_dualview_is_single_device::value>* =
                nullptr>
  inline void modify_device() {
    if (impl_dualview_is_single_device::value) return;
    if (modified_flags.data() != nullptr) {
      modified_flags(1) =
          (modified_flags(1) > modified_flags(0) ? modified_flags(1)
@ -853,6 +874,13 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
    }
  }

  template <
      class Dummy = DualView,
      std::enable_if_t<Dummy::impl_dualview_is_single_device::value>* = nullptr>
  inline void modify_device() {
    return;
  }

  inline void clear_sync_state() {
    if (modified_flags.data() != nullptr)
      modified_flags(1) = modified_flags(0) = 0;
@ -875,8 +903,15 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                      const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                      const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                      const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
    ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
    h_view = create_mirror_view(d_view);
    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
    const bool sizeMismatch =
        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);

    if (sizeMismatch) {
      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
      h_view = create_mirror_view(d_view);
    } else
      ::Kokkos::deep_copy(d_view, typename t_dev::value_type{});

    /* Reset dirty flags */
    if (modified_flags.data() == nullptr) {
@ -897,41 +932,31 @@ class DualView : public ViewTraits<DataType, Arg1Type, Arg2Type, Arg3Type> {
                    const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                    const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG,
                    const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) {
    const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7};
    const bool sizeMismatch =
        Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents);

    if (modified_flags.data() == nullptr) {
      modified_flags = t_modified_flags("DualView::modified_flags");
    }
    if (modified_flags(1) >= modified_flags(0)) {
      /* Resize on Device */
      ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
      h_view = create_mirror_view(d_view);

      /* Mark Device copy as modified */
      modified_flags(1) = modified_flags(1) + 1;
      if (sizeMismatch) {
        ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7);
        h_view = create_mirror_view(d_view);

        /* Mark Device copy as modified */
        modified_flags(1) = modified_flags(1) + 1;
      }
    } else {
      /* Realloc on Device */

      ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7);

      const bool sizeMismatch =
          (h_view.extent(0) != n0) || (h_view.extent(1) != n1) ||
          (h_view.extent(2) != n2) || (h_view.extent(3) != n3) ||
          (h_view.extent(4) != n4) || (h_view.extent(5) != n5) ||
          (h_view.extent(6) != n6) || (h_view.extent(7) != n7);
      if (sizeMismatch)
      if (sizeMismatch) {
        ::Kokkos::resize(h_view, n0, n1, n2, n3, n4, n5, n6, n7);
        d_view = create_mirror_view(typename t_dev::execution_space(), h_view);

      t_host temp_view = create_mirror_view(d_view);

      /* Remap on Host */
      Kokkos::deep_copy(temp_view, h_view);

      h_view = temp_view;

      d_view = create_mirror_view(typename t_dev::execution_space(), h_view);

      /* Mark Host copy as modified */
      modified_flags(0) = modified_flags(0) + 1;
        /* Mark Host copy as modified */
        modified_flags(0) = modified_flags(0) + 1;
      }
    }
  }

@ -1140,7 +1140,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
    // to avoid incomplete type errors from using Kokkos::Cuda directly.
    if (std::is_same<Kokkos::CudaUVMSpace,
                     typename traits::device_type::memory_space>::value) {
      typename traits::device_type::memory_space::execution_space().fence();
      typename traits::device_type::memory_space::execution_space().fence(
          "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation");
    }
#endif
    //------------------------------------------------------------
@ -1154,7 +1155,8 @@ class DynRankView : public ViewTraits<DataType, Properties...> {
#if defined(KOKKOS_ENABLE_CUDA)
    if (std::is_same<Kokkos::CudaUVMSpace,
                     typename traits::device_type::memory_space>::value) {
      typename traits::device_type::memory_space::execution_space().fence();
      typename traits::device_type::memory_space::execution_space().fence(
          "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation");
    }
#endif
    //------------------------------------------------------------
@ -1404,7 +1406,7 @@ class ViewMapping<

  template <class MemoryTraits>
  struct apply {
    static_assert(Kokkos::Impl::is_memory_traits<MemoryTraits>::value, "");
    static_assert(Kokkos::is_memory_traits<MemoryTraits>::value, "");

    using traits_type =
        Kokkos::ViewTraits<data_type, array_layout,
@ -1574,7 +1576,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(const DynRankView<LT, LP...>& lhs,
namespace Kokkos {
namespace Impl {

template <class OutputView, typename Enable = void>
template <class OutputView, class Enable = void>
struct DynRankViewFill {
  using const_value_type = typename OutputView::traits::const_value_type;

@ -1693,9 +1695,11 @@ inline void deep_copy(
                   typename ViewTraits<DT, DP...>::value_type>::value,
      "deep_copy requires non-const type");

  Kokkos::fence();
  Kokkos::fence(
      "Kokkos::deep_copy(DynRankView, value_type): fence before filling view");
  Kokkos::Impl::DynRankViewFill<DynRankView<DT, DP...> >(dst, value);
  Kokkos::fence();
  Kokkos::fence(
      "Kokkos::deep_copy(DynRankView, value_type): fence after filling view");
}

/** \brief Deep copy into a value in Host memory from a view. */
@ -1711,10 +1715,13 @@ inline void deep_copy(

  using src_traits       = ViewTraits<ST, SP...>;
  using src_memory_space = typename src_traits::memory_space;
  Kokkos::fence();
  Kokkos::fence(
      "Kokkos::deep_copy(value_type, DynRankView): fence before copying "
      "value");
  Kokkos::Impl::DeepCopy<HostSpace, src_memory_space>(&dst, src.data(),
                                                      sizeof(ST));
  Kokkos::fence();
  Kokkos::fence(
      "Kokkos::deep_copy(value_type, DynRankView): fence after copying value");
}

//----------------------------------------------------------------------------
@ -1744,14 +1751,14 @@ inline void deep_copy(

  enum {
    DstExecCanAccessSrc =
        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
                                         src_memory_space>::accessible
        Kokkos::SpaceAccessibility<dst_execution_space,
                                   src_memory_space>::accessible
  };

  enum {
    SrcExecCanAccessDst =
        Kokkos::Impl::SpaceAccessibility<src_execution_space,
                                         dst_memory_space>::accessible
        Kokkos::SpaceAccessibility<src_execution_space,
                                   dst_memory_space>::accessible
  };

  if ((void*)dst.data() != (void*)src.data()) {
@ -1762,10 +1769,14 @@ inline void deep_copy(
    // memory then can byte-wise copy
    if (rank(src) == 0 && rank(dst) == 0) {
      using value_type = typename dst_type::value_type;
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
          "copying rank-0 views");
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
          dst.data(), src.data(), sizeof(value_type));
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
          "copying rank-0 views");
    } else if (std::is_same<
                   typename DstType::traits::value_type,
                   typename SrcType::traits::non_const_value_type>::value &&
@ -1787,10 +1798,14 @@ inline void deep_copy(
        dst.extent(6) == src.extent(6) &&
        dst.extent(7) == src.extent(7)) {
      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
          "copying rank-1 views");
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
          dst.data(), src.data(), nbytes);
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
          "copying rank-1 views");
    } else if (std::is_same<
                   typename DstType::traits::value_type,
                   typename SrcType::traits::non_const_value_type>::value &&
@ -1817,29 +1832,43 @@ inline void deep_copy(
        dst.stride_6() == src.stride_6() &&
        dst.stride_7() == src.stride_7()) {
      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
          "copying rank-1 views");
      Kokkos::Impl::DeepCopy<dst_memory_space, src_memory_space>(
          dst.data(), src.data(), nbytes);
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
          "copying rank-1 views");
    } else if (DstExecCanAccessSrc) {
      // Copying data between views in accessible memory spaces and either
      // non-contiguous or incompatible shape.
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
          "remapping views of incompatible shape");
      Kokkos::Impl::DynRankViewRemap<dst_type, src_type>(dst, src);
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
          "remapping views of incompatible shape");
    } else if (SrcExecCanAccessDst) {
      // Copying data between views in accessible memory spaces and either
      // non-contiguous or incompatible shape.
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before "
          "remapping views of incompatible shape");
      Kokkos::Impl::DynRankViewRemap<dst_type, src_type, src_execution_space>(
          dst, src);
      Kokkos::fence();
      Kokkos::fence(
          "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after "
          "remapping views of incompatible shape");
    } else {
      Kokkos::Impl::throw_runtime_exception(
          "deep_copy given views that would require a temporary allocation");
    }
  } else {
    Kokkos::fence();
    Kokkos::fence(
        "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence due to same "
        "src and dst");
  }
}

@ -53,36 +53,203 @@
namespace Kokkos {
namespace Experimental {

// Simple metafunction for choosing memory space
// In the current implementation, if memory_space == CudaSpace,
// use CudaUVMSpace for the chunk 'array' allocation, which
// will contain pointers to chunks of memory allocated
// in CudaSpace
namespace Impl {
template <class MemSpace>
struct ChunkArraySpace {
  using memory_space = MemSpace;

/// Utility class to manage memory for chunked arrays on the host and
/// device. Allocates/deallocates memory on both the host and device along with
/// providing utilities for creating mirrors and deep copying between them.
template <typename MemorySpace, typename ValueType>
struct ChunkedArrayManager {
  using value_type   = ValueType;
  using pointer_type = ValueType*;
  using track_type   = Kokkos::Impl::SharedAllocationTracker;

  ChunkedArrayManager()                           = default;
  ChunkedArrayManager(ChunkedArrayManager const&) = default;
  ChunkedArrayManager(ChunkedArrayManager&&)      = default;
  ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default;
  ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default;

  template <typename Space, typename Value>
  friend struct ChunkedArrayManager;

  template <typename Space, typename Value>
  inline ChunkedArrayManager(const ChunkedArrayManager<Space, Value>& rhs)
      : m_valid(rhs.m_valid),
        m_chunk_max(rhs.m_chunk_max),
        m_chunks((ValueType**)(rhs.m_chunks)),
        m_track(rhs.m_track),
        m_chunk_size(rhs.m_chunk_size) {
    static_assert(
        Kokkos::Impl::MemorySpaceAccess<MemorySpace, Space>::assignable,
        "Incompatible ChunkedArrayManager copy construction");
  }

  ChunkedArrayManager(const unsigned arg_chunk_max,
                      const unsigned arg_chunk_size)
      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}

 private:
  struct ACCESSIBLE_TAG {};
  struct INACCESSIBLE_TAG {};

  ChunkedArrayManager(ACCESSIBLE_TAG, pointer_type* arg_chunks,
                      const unsigned arg_chunk_max)
      : m_valid(true), m_chunk_max(arg_chunk_max), m_chunks(arg_chunks) {}

  ChunkedArrayManager(INACCESSIBLE_TAG, const unsigned arg_chunk_max,
                      const unsigned arg_chunk_size)
      : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {}

 public:
  template <typename Space, typename Enable_ = void>
  struct IsAccessibleFrom;

  template <typename Space>
  struct IsAccessibleFrom<
      Space, typename std::enable_if_t<Kokkos::Impl::MemorySpaceAccess<
                 MemorySpace, Space>::accessible>> : std::true_type {};

  template <typename Space>
  struct IsAccessibleFrom<
      Space, typename std::enable_if_t<!Kokkos::Impl::MemorySpaceAccess<
                 MemorySpace, Space>::accessible>> : std::false_type {};

  template <typename Space>
  static ChunkedArrayManager<Space, ValueType> create_mirror(
      ChunkedArrayManager<MemorySpace, ValueType> const& other,
      typename std::enable_if<IsAccessibleFrom<Space>::value>::type* =
          nullptr) {
    return ChunkedArrayManager<Space, ValueType>{
        ACCESSIBLE_TAG{}, other.m_chunks, other.m_chunk_max};
  }

  template <typename Space>
  static ChunkedArrayManager<Space, ValueType> create_mirror(
      ChunkedArrayManager<MemorySpace, ValueType> const& other,
      typename std::enable_if<!IsAccessibleFrom<Space>::value>::type* =
          nullptr) {
    using tag_type =
        typename ChunkedArrayManager<Space, ValueType>::INACCESSIBLE_TAG;
    return ChunkedArrayManager<Space, ValueType>{tag_type{}, other.m_chunk_max,
                                                 other.m_chunk_size};
  }

 public:
  void allocate_device(const std::string& label) {
    if (m_chunks == nullptr) {
      m_chunks = reinterpret_cast<pointer_type*>(MemorySpace().allocate(
          label.c_str(), (sizeof(pointer_type) * (m_chunk_max + 2))));
    }
  }

  void initialize() {
    for (unsigned i = 0; i < m_chunk_max + 2; i++) {
      m_chunks[i] = nullptr;
    }
    m_valid = true;
  }

 private:
  /// Custom destroy functor for deallocating array chunks along with a linked
  /// allocation
  template <typename Space>
  struct Destroy {
    Destroy()               = default;
    Destroy(Destroy&&)      = default;
    Destroy(const Destroy&) = default;
    Destroy& operator=(Destroy&&) = default;
    Destroy& operator=(const Destroy&) = default;

    Destroy(std::string label, value_type** arg_chunk,
            const unsigned arg_chunk_max, const unsigned arg_chunk_size,
            value_type** arg_linked)
        : m_label(label),
          m_chunks(arg_chunk),
          m_linked(arg_linked),
          m_chunk_max(arg_chunk_max),
          m_chunk_size(arg_chunk_size) {}

    void execute() {
      // Destroy the array of chunk pointers.
      // Two entries beyond the max chunks are allocation counters.
      uintptr_t const len =
          *reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
      for (unsigned i = 0; i < len; i++) {
        Space().deallocate(m_label.c_str(), m_chunks[i],
                           sizeof(value_type) * m_chunk_size);
      }
      // Destroy the linked allocation if we have one.
      if (m_linked != nullptr) {
        Space().deallocate(m_label.c_str(), m_linked,
                           (sizeof(value_type*) * (m_chunk_max + 2)));
      }
    }

    void destroy_shared_allocation() { execute(); }

    std::string m_label;
    value_type** m_chunks = nullptr;
    value_type** m_linked = nullptr;
    unsigned m_chunk_max;
    unsigned m_chunk_size;
  };

 public:
  template <typename Space>
  void allocate_with_destroy(const std::string& label,
                             pointer_type* linked_allocation = nullptr) {
    using destroy_type = Destroy<Space>;
    using record_type =
        Kokkos::Impl::SharedAllocationRecord<MemorySpace, destroy_type>;

    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent. This must match
    // Destroy's execute(...) method.
    record_type* const record = record_type::allocate(
        MemorySpace(), label, (sizeof(pointer_type) * (m_chunk_max + 2)));
    m_chunks = static_cast<pointer_type*>(record->data());
    m_track.assign_allocated_record_to_uninitialized(record);

    record->m_destroy = destroy_type(label, m_chunks, m_chunk_max, m_chunk_size,
                                     linked_allocation);
  }

  pointer_type* get_ptr() const { return m_chunks; }

  template <typename Space>
  typename std::enable_if<!IsAccessibleFrom<Space>::value>::type deep_copy_to(
      ChunkedArrayManager<Space, ValueType> const& other) {
    Kokkos::Impl::DeepCopy<Space, MemorySpace>(
        other.m_chunks, m_chunks, sizeof(pointer_type) * (m_chunk_max + 2));
  }

  template <typename Space>
  typename std::enable_if<IsAccessibleFrom<Space>::value>::type deep_copy_to(
      ChunkedArrayManager<Space, ValueType> const&) {
    // no-op
  }

  KOKKOS_INLINE_FUNCTION
  pointer_type* operator+(int i) const { return m_chunks + i; }

  KOKKOS_INLINE_FUNCTION
  pointer_type& operator[](int i) const { return m_chunks[i]; }

  track_type const& track() const { return m_track; }

  KOKKOS_INLINE_FUNCTION
  bool valid() const { return m_valid; }

 private:
  bool m_valid           = false;
  unsigned m_chunk_max   = 0;
  pointer_type* m_chunks = nullptr;
  track_type m_track;
  unsigned m_chunk_size = 0;
};
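To make the host/device split of the class above concrete, here is a hedged sketch of how a pair of managers cooperates when the device memory is not host-accessible (mirroring the DynamicView constructor further below). ChunkedArrayManager lives in an Impl namespace, so this is illustrative pseudocode of the intended flow, not supported public API; the space, value type, label, and sizes are all assumptions:

// Illustrative only: internal Impl class, flow mirrors DynamicView's ctor.
using device_mgr =
    Kokkos::Experimental::Impl::ChunkedArrayManager<Kokkos::CudaSpace, double>;

device_mgr dev(/*chunk_max=*/8, /*chunk_size=*/128);
dev.allocate_device("chunks");  // raw device-side chunk-pointer array
auto host = device_mgr::create_mirror<Kokkos::HostSpace>(dev);
host.allocate_with_destroy<Kokkos::CudaSpace>("chunks", dev.get_ptr());
host.initialize();       // null out pointers + counters on the host copy
host.deep_copy_to(dev);  // push the initialized pointer array to the device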

#ifdef KOKKOS_ENABLE_CUDA
template <>
struct ChunkArraySpace<Kokkos::CudaSpace> {
  using memory_space = typename Kokkos::CudaUVMSpace;
};
#endif
#ifdef KOKKOS_ENABLE_HIP
template <>
struct ChunkArraySpace<Kokkos::Experimental::HIPSpace> {
  using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace;
};
#endif
#ifdef KOKKOS_ENABLE_SYCL
template <>
struct ChunkArraySpace<Kokkos::Experimental::SYCLDeviceUSMSpace> {
  using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace;
};
#endif
}  // end namespace Impl
} /* end namespace Impl */

/** \brief Dynamic views are restricted to rank-one and no layout.
 *         Resize only occurs on host outside of parallel_regions.
@ -93,6 +260,13 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
 public:
  using traits = Kokkos::ViewTraits<DataType, P...>;

  using value_type   = typename traits::value_type;
  using device_space = typename traits::memory_space;
  using host_space =
      typename Kokkos::Impl::HostMirror<device_space>::Space::memory_space;
  using device_accessor = Impl::ChunkedArrayManager<device_space, value_type>;
  using host_accessor   = Impl::ChunkedArrayManager<host_space, value_type>;

 private:
  template <class, class...>
  friend class DynamicView;
@ -108,7 +282,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
      "DynamicView only implemented for non-specialized View type");

  template <class Space, bool = Kokkos::Impl::MemorySpaceAccess<
                             Space, typename traits::memory_space>::accessible>
                             Space, device_space>::accessible>
  struct verify_space {
    KOKKOS_FORCEINLINE_FUNCTION static void check() {}
  };
@ -123,9 +297,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  };

 private:
  track_type m_track;
  typename traits::value_type** m_chunks =
      nullptr;  // array of pointers to 'chunks' of memory
  device_accessor m_chunks;
  host_accessor m_chunks_host;
  unsigned m_chunk_shift;  // ceil(log2(m_chunk_size))
  unsigned m_chunk_mask;   // m_chunk_size - 1
  unsigned m_chunk_max;  // number of entries in the chunk array - each pointing
@ -173,7 +346,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {

  KOKKOS_INLINE_FUNCTION
  size_t allocation_extent() const noexcept {
    uintptr_t n = *reinterpret_cast<const uintptr_t*>(m_chunks + m_chunk_max);
    uintptr_t n =
        *reinterpret_cast<const uintptr_t*>(m_chunks_host + m_chunk_max);
    return (n << m_chunk_shift);
  }

@ -183,7 +357,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  KOKKOS_INLINE_FUNCTION
  size_t size() const noexcept {
    size_t extent_0 =
        *reinterpret_cast<const size_t*>(m_chunks + m_chunk_max + 1);
        *reinterpret_cast<const size_t*>(m_chunks_host + m_chunk_max + 1);
    return extent_0;
  }

@ -215,10 +389,10 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  // Allocation tracking properties

  KOKKOS_INLINE_FUNCTION
  int use_count() const { return m_track.use_count(); }
  int use_count() const { return m_chunks_host.track().use_count(); }

  inline const std::string label() const {
    return m_track.template get_label<typename traits::memory_space>();
    return m_chunks_host.track().template get_label<host_space>();
  }

  //----------------------------------------------------------------------
@ -285,13 +459,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
   *          up to the maximum number of chunks
   * */
  template <typename IntType>
  inline typename std::enable_if<
      std::is_integral<IntType>::value &&
      Kokkos::Impl::MemorySpaceAccess<
          Kokkos::HostSpace,
          typename Impl::ChunkArraySpace<
              typename traits::memory_space>::memory_space>::accessible>::type
  resize_serial(IntType const& n) {
  inline void resize_serial(IntType const& n) {
    using local_value_type   = typename traits::value_type;
    using value_pointer_type = local_value_type*;

@ -304,37 +472,40 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
    }

    // *m_chunks[m_chunk_max] stores the current number of chunks being used
    uintptr_t* const pc = reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
    std::string _label =
        m_track.template get_label<typename traits::memory_space>();
    uintptr_t* const pc =
        reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
    std::string _label = m_chunks_host.track().template get_label<host_space>();

    if (*pc < NC) {
      while (*pc < NC) {
        m_chunks[*pc] = reinterpret_cast<value_pointer_type>(
            typename traits::memory_space().allocate(
        m_chunks_host[*pc] =
            reinterpret_cast<value_pointer_type>(device_space().allocate(
                _label.c_str(), sizeof(local_value_type) << m_chunk_shift));
        ++*pc;
      }
    } else {
      while (NC + 1 <= *pc) {
        --*pc;
        typename traits::memory_space().deallocate(
            _label.c_str(), m_chunks[*pc],
            sizeof(local_value_type) << m_chunk_shift);
        m_chunks[*pc] = nullptr;
        device_space().deallocate(_label.c_str(), m_chunks_host[*pc],
                                  sizeof(local_value_type) << m_chunk_shift);
        m_chunks_host[*pc] = nullptr;
      }
    }
    // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize
    // *m_chunks_host[m_chunk_max+1] stores the 'extent' requested by resize
    *(pc + 1) = n;

    m_chunks_host.deep_copy_to(m_chunks);
  }
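The two bookkeeping slots referenced above sit past the end of the chunk-pointer array: slot m_chunk_max holds the number of chunks currently allocated, and slot m_chunk_max+1 holds the extent requested by the last resize. A small standalone sketch of that layout in plain C++ (no Kokkos; the size is hypothetical):

#include <cstdint>
#include <cstdlib>

int main() {
  const unsigned chunk_max = 8;
  // chunk_max chunk pointers, followed by two counter slots.
  double** chunks =
      static_cast<double**>(std::malloc(sizeof(double*) * (chunk_max + 2)));
  uintptr_t* pc = reinterpret_cast<uintptr_t*>(chunks + chunk_max);
  pc[0] = 0;  // number of chunks currently allocated
  pc[1] = 0;  // extent requested by the last resize
  std::free(chunks);
}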

  KOKKOS_INLINE_FUNCTION bool is_allocated() const {
    if (m_chunks == nullptr) {
      return false;
    } else {
      // *m_chunks[m_chunk_max] stores the current number of chunks being used
    if (m_chunks_host.valid()) {
      // *m_chunks_host[m_chunk_max] stores the current number of chunks being
      // used
      uintptr_t* const pc =
          reinterpret_cast<uintptr_t*>(m_chunks + m_chunk_max);
          reinterpret_cast<uintptr_t*>(m_chunks_host + m_chunk_max);
      return (*(pc + 1) > 0);
    } else {
      return false;
    }
  }

@ -349,8 +520,8 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {

  template <class RT, class... RP>
  DynamicView(const DynamicView<RT, RP...>& rhs)
      : m_track(rhs.m_track),
        m_chunks((typename traits::value_type**)rhs.m_chunks),
      : m_chunks(rhs.m_chunks),
        m_chunks_host(rhs.m_chunks_host),
        m_chunk_shift(rhs.m_chunk_shift),
        m_chunk_mask(rhs.m_chunk_mask),
        m_chunk_max(rhs.m_chunk_max),
@ -361,63 +532,6 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
        "Incompatible DynamicView copy construction");
  }

  //----------------------------------------------------------------------

  struct Destroy {
    using local_value_type = typename traits::value_type;
    std::string m_label;
    local_value_type** m_chunks;
    unsigned m_chunk_max;
    bool m_destroy;
    unsigned m_chunk_size;

    // Initialize or destroy array of chunk pointers.
    // Two entries beyond the max chunks are allocation counters.
    inline void operator()(unsigned i) const {
      if (m_destroy && i < m_chunk_max && nullptr != m_chunks[i]) {
        typename traits::memory_space().deallocate(
            m_label.c_str(), m_chunks[i],
            sizeof(local_value_type) * m_chunk_size);
      }
      m_chunks[i] = nullptr;
    }

    void execute(bool arg_destroy) {
      using Range = Kokkos::RangePolicy<typename HostSpace::execution_space>;

      m_destroy = arg_destroy;

      Kokkos::Impl::ParallelFor<Destroy, Range> closure(
          *this,
          Range(0, m_chunk_max + 2));  // Add 2 to 'destroy' extra slots storing
                                       // num_chunks and extent; previously + 1

      closure.execute();

      typename traits::execution_space().fence();
      // Impl::ChunkArraySpace< typename traits::memory_space
      // >::memory_space::execution_space().fence();
    }

    void construct_shared_allocation() { execute(false); }

    void destroy_shared_allocation() { execute(true); }

    Destroy()               = default;
    Destroy(Destroy&&)      = default;
    Destroy(const Destroy&) = default;
    Destroy& operator=(Destroy&&) = default;
    Destroy& operator=(const Destroy&) = default;

    Destroy(std::string label, typename traits::value_type** arg_chunk,
            const unsigned arg_chunk_max, const unsigned arg_chunk_size)
        : m_label(label),
          m_chunks(arg_chunk),
          m_chunk_max(arg_chunk_max),
          m_destroy(false),
          m_chunk_size(arg_chunk_size) {}
  };

  /**\brief Allocation constructor
   *
   *  Memory is allocated in chunks
@ -427,10 +541,7 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
  explicit inline DynamicView(const std::string& arg_label,
                              const unsigned min_chunk_size,
                              const unsigned max_extent)
      : m_track(),
        m_chunks(nullptr)
        // The chunk size is guaranteed to be a power of two
        ,
      : // The chunk size is guaranteed to be a power of two
        m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains(
            min_chunk_size))  // div ceil(log2(min_chunk_size))
        ,
@ -440,28 +551,22 @@ class DynamicView : public Kokkos::ViewTraits<DataType, P...> {
            m_chunk_shift)  // max num pointers-to-chunks in array
        ,
        m_chunk_size(2 << (m_chunk_shift - 1)) {
    using chunk_array_memory_space = typename Impl::ChunkArraySpace<
        typename traits::memory_space>::memory_space;
    // A functor to deallocate all of the chunks upon final destruction
    using record_type =
        Kokkos::Impl::SharedAllocationRecord<chunk_array_memory_space, Destroy>;
    m_chunks = device_accessor(m_chunk_max, m_chunk_size);

    // Allocate chunk pointers and allocation counter
    record_type* const record =
        record_type::allocate(chunk_array_memory_space(), arg_label,
                              (sizeof(pointer_type) * (m_chunk_max + 2)));
    // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] ==
    // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent. This must match
    // Destroy's execute(...) method.

    m_chunks = reinterpret_cast<pointer_type*>(record->data());

    record->m_destroy = Destroy(arg_label, m_chunks, m_chunk_max, m_chunk_size);

    // Initialize to zero
    record->m_destroy.construct_shared_allocation();

    m_track.assign_allocated_record_to_uninitialized(record);
    if (device_accessor::template IsAccessibleFrom<host_space>::value) {
      m_chunks.template allocate_with_destroy<device_space>(arg_label);
      m_chunks.initialize();
      m_chunks_host =
          device_accessor::template create_mirror<host_space>(m_chunks);
    } else {
      m_chunks.allocate_device(arg_label);
      m_chunks_host =
          device_accessor::template create_mirror<host_space>(m_chunks);
      m_chunks_host.template allocate_with_destroy<device_space>(
          arg_label, m_chunks.get_ptr());
      m_chunks_host.initialize();
      m_chunks_host.deep_copy_to(m_chunks);
    }
  }
};

@ -487,8 +592,8 @@ inline void deep_copy(const View<T, DP...>& dst,

  enum {
    DstExecCanAccessSrc =
        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
                                         src_memory_space>::accessible
        Kokkos::SpaceAccessibility<dst_execution_space,
                                   src_memory_space>::accessible
  };

  if (DstExecCanAccessSrc) {
@ -512,8 +617,8 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,

  enum {
    DstExecCanAccessSrc =
        Kokkos::Impl::SpaceAccessibility<dst_execution_space,
                                         src_memory_space>::accessible
        Kokkos::SpaceAccessibility<dst_execution_space,
                                   src_memory_space>::accessible
  };

  if (DstExecCanAccessSrc) {

@ -187,7 +187,8 @@ template <typename ReportType, typename DeviceType>
void ErrorReporter<ReportType, DeviceType>::resize(const size_t new_size) {
  m_reports.resize(new_size);
  m_reporters.resize(new_size);
  typename DeviceType::execution_space().fence();
  typename DeviceType::execution_space().fence(
      "Kokkos::Experimental::ErrorReporter::resize: fence after resizing");
}

}  // namespace Experimental

@ -116,8 +116,7 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds(
       This check should cover the case of Views that don't
       have the Unmanaged trait but were initialized by pointer. */
    if (tracker.has_record()) {
      Kokkos::Impl::operator_bounds_error_on_device<MapType>(
          map, Kokkos::Impl::has_printable_label_typedef<MapType>());
      Kokkos::Impl::operator_bounds_error_on_device(map);
    } else {
      Kokkos::abort("OffsetView bounds error");
    }
@ -1244,7 +1243,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
    // to avoid incomplete type errors from using Kokkos::Cuda directly.
    if (std::is_same<Kokkos::CudaUVMSpace,
                     typename traits::device_type::memory_space>::value) {
      typename traits::device_type::memory_space::execution_space().fence();
      typename traits::device_type::memory_space::execution_space().fence(
          "Kokkos::OffsetView::OffsetView(): fence before UVM allocation");
    }
#endif
    //------------------------------------------------------------
@ -1256,7 +1256,8 @@ class OffsetView : public ViewTraits<DataType, Properties...> {
#if defined(KOKKOS_ENABLE_CUDA)
    if (std::is_same<Kokkos::CudaUVMSpace,
                     typename traits::device_type::memory_space>::value) {
      typename traits::device_type::memory_space::execution_space().fence();
      typename traits::device_type::memory_space::execution_space().fence(
          "Kokkos::OffsetView::OffsetView(): fence after UVM allocation");
    }
#endif
    //------------------------------------------------------------

@ -834,7 +834,7 @@ class ScatterView<DataType, Layout, DeviceType, Op, ScatterNonDuplicated,
    static_assert(std::is_same<typename dest_type::array_layout, Layout>::value,
                  "ScatterView contribute destination has different layout");
    static_assert(
        Kokkos::Impl::SpaceAccessibility<
        Kokkos::SpaceAccessibility<
            execution_space, typename dest_type::memory_space>::accessible,
        "ScatterView contribute destination memory space not accessible");
    if (dest.data() == internal_view.data()) return;
@ -1061,7 +1061,7 @@ class ScatterView<DataType, Kokkos::LayoutRight, DeviceType, Op,
                               Kokkos::LayoutRight>::value,
                  "ScatterView deep_copy destination has different layout");
    static_assert(
        Kokkos::Impl::SpaceAccessibility<
        Kokkos::SpaceAccessibility<
            execution_space, typename dest_type::memory_space>::accessible,
        "ScatterView deep_copy destination memory space not accessible");
    bool is_equal = (dest.data() == internal_view.data());
@ -1290,7 +1290,7 @@ class ScatterView<DataType, Kokkos::LayoutLeft, DeviceType, Op,
                               Kokkos::LayoutLeft>::value,
                  "ScatterView deep_copy destination has different layout");
    static_assert(
        Kokkos::Impl::SpaceAccessibility<
        Kokkos::SpaceAccessibility<
            execution_space, typename dest_type::memory_space>::accessible,
        "ScatterView deep_copy destination memory space not accessible");
    auto extent = internal_view.extent(internal_view_type::rank - 1);

@ -405,7 +405,9 @@ class StaticCrsGraph {
    Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning",
                         Kokkos::RangePolicy<execution_space>(0, numRows()),
                         partitioner);
    typename device_type::execution_space().fence();
    typename device_type::execution_space().fence(
        "Kokkos::StaticCrsGraph::create_block_partitioning:: fence after "
        "partition");

    row_block_offsets = block_offsets;
  }

@ -345,7 +345,8 @@ class UnorderedMap {
      const impl_value_type tmp = impl_value_type();
      Kokkos::deep_copy(m_values, tmp);
    }
    { Kokkos::deep_copy(m_scalars, 0); }
    Kokkos::deep_copy(m_scalars, 0);
    m_size = 0;
  }

  KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const {
@ -393,9 +394,9 @@ class UnorderedMap {
  ///
  /// This method has undefined behavior when erasable() is true.
  ///
  /// Note that this is not a device function; it cannot be called in
  /// Note that this is <i>not</i> a device function; it cannot be called in
  /// a parallel kernel. The value is not stored as a variable; it
  /// must be computed.
  /// must be computed. m_size is a mutable cache of that value.
  size_type size() const {
    if (capacity() == 0u) return 0u;
    if (modified()) {
@ -419,9 +420,13 @@ class UnorderedMap {
  bool begin_erase() {
    bool result = !erasable();
    if (is_insertable_map && result) {
      execution_space().fence();
      execution_space().fence(
          "Kokkos::UnorderedMap::begin_erase: fence before setting erasable "
          "flag");
      set_flag(erasable_idx);
      execution_space().fence();
      execution_space().fence(
          "Kokkos::UnorderedMap::begin_erase: fence after setting erasable "
          "flag");
    }
    return result;
  }
@ -429,10 +434,12 @@ class UnorderedMap {
  bool end_erase() {
    bool result = erasable();
    if (is_insertable_map && result) {
      execution_space().fence();
      execution_space().fence(
          "Kokkos::UnorderedMap::end_erase: fence before erasing");
      Impl::UnorderedMapErase<declared_map_type> f(*this);
      f.apply();
      execution_space().fence();
      execution_space().fence(
          "Kokkos::UnorderedMap::end_erase: fence after erasing");
      reset_flag(erasable_idx);
    }
    return result;
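For context, a hedged sketch of the erase protocol these fences guard: erase() calls are only legal between begin_erase() and end_erase(), which flip the map into and out of its erasable state. The key type, capacity, and values below are illustrative; a host execution space keeps the example self-contained:

using map_type =
    Kokkos::UnorderedMap<int, double, Kokkos::DefaultHostExecutionSpace>;
map_type map(1024);
map.insert(42, 3.14);

map.begin_erase();  // fences around setting the erasable flag
map.erase(42);      // queue the key for erasure
map.end_erase();    // fence, apply pending erasures, reset the flag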

@ -119,12 +119,14 @@ class vector : public DualView<Scalar*, LayoutLeft, Arg1Type> {
    if (DV::template need_sync<typename DV::t_dev::device_type>()) {
      set_functor_host f(DV::h_view, val);
      parallel_for("Kokkos::vector::assign", n, f);
      typename DV::t_host::execution_space().fence();
      typename DV::t_host::execution_space().fence(
          "Kokkos::vector::assign: fence after assigning values");
      DV::template modify<typename DV::t_host::device_type>();
    } else {
      set_functor f(DV::d_view, val);
      parallel_for("Kokkos::vector::assign", n, f);
      typename DV::t_dev::execution_space().fence();
      typename DV::t_dev::execution_space().fence(
          "Kokkos::vector::assign: fence after assigning values");
      DV::template modify<typename DV::t_dev::device_type>();
    }
  }

@ -57,22 +57,10 @@
namespace Kokkos {
namespace Impl {

KOKKOS_FORCEINLINE_FUNCTION
unsigned rotate_left(unsigned i, int r) {
  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
  return r ? ((i << r) | (i >> (size - r))) : i;
}

KOKKOS_FORCEINLINE_FUNCTION
unsigned rotate_right(unsigned i, int r) {
  constexpr int size = static_cast<int>(sizeof(unsigned) * CHAR_BIT);
  // FIXME_SYCL llvm.fshr.i32 missing
  // (https://github.com/intel/llvm/issues/3308)
#ifdef __SYCL_DEVICE_ONLY__
  return rotate_left(i, size - r);
#else
  return r ? ((i >> r) | (i << (size - r))) : i;
#endif
}

template <typename Bitset>

@ -75,7 +75,7 @@ uint32_t fmix32(uint32_t h) {

KOKKOS_INLINE_FUNCTION
uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) {
  const uint8_t* data = (const uint8_t*)key;
  const uint8_t* data = static_cast<const uint8_t*>(key);
  const int nblocks = len / 4;

  uint32_t h1 = seed;

@ -49,7 +49,7 @@
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <Kokkos_DualView.hpp>

namespace Test {

@ -702,6 +702,11 @@ class TestDynViewAPI {

  using View0 = Kokkos::View<T, device>;
  using View1 = Kokkos::View<T*, device>;
  using View2 = Kokkos::View<T**, device>;
  using View3 = Kokkos::View<T***, device>;
  using View4 = Kokkos::View<T****, device>;
  using View5 = Kokkos::View<T*****, device>;
  using View6 = Kokkos::View<T******, device>;
  using View7 = Kokkos::View<T*******, device>;

  using host_view_space = typename View0::host_mirror_space;
@ -1065,7 +1070,7 @@ class TestDynViewAPI {

    dView0 d_uninitialized(
        Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20);
    ASSERT_TRUE(d_uninitialized.data() != nullptr);
    ASSERT_NE(d_uninitialized.data(), nullptr);
    ASSERT_EQ(d_uninitialized.rank(), 2);
    ASSERT_EQ(d_uninitialized.extent(0), 10);
    ASSERT_EQ(d_uninitialized.extent(1), 20);
@ -1075,14 +1080,14 @@ class TestDynViewAPI {
    hView0 hx, hy, hz;

    ASSERT_TRUE(Kokkos::is_dyn_rank_view<dView0>::value);
    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double> >::value);
    ASSERT_FALSE(Kokkos::is_dyn_rank_view<Kokkos::View<double>>::value);

    ASSERT_TRUE(dx.data() == nullptr);  // Okay with UVM
    ASSERT_TRUE(dy.data() == nullptr);  // Okay with UVM
    ASSERT_TRUE(dz.data() == nullptr);  // Okay with UVM
    ASSERT_TRUE(hx.data() == nullptr);
    ASSERT_TRUE(hy.data() == nullptr);
    ASSERT_TRUE(hz.data() == nullptr);
    ASSERT_EQ(dx.data(), nullptr);  // Okay with UVM
    ASSERT_EQ(dy.data(), nullptr);  // Okay with UVM
    ASSERT_EQ(dz.data(), nullptr);  // Okay with UVM
    ASSERT_EQ(hx.data(), nullptr);
    ASSERT_EQ(hy.data(), nullptr);
    ASSERT_EQ(hz.data(), nullptr);
    ASSERT_EQ(dx.extent(0), 0u);  // Okay with UVM
    ASSERT_EQ(dy.extent(0), 0u);  // Okay with UVM
    ASSERT_EQ(dz.extent(0), 0u);  // Okay with UVM
@ -1153,11 +1158,11 @@ class TestDynViewAPI {

    ASSERT_EQ(dx.use_count(), size_t(2));

    ASSERT_FALSE(dx.data() == nullptr);
    ASSERT_FALSE(const_dx.data() == nullptr);
    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
    ASSERT_FALSE(dy.data() == nullptr);
    ASSERT_NE(dx.data(), nullptr);
    ASSERT_NE(const_dx.data(), nullptr);
    ASSERT_NE(unmanaged_dx.data(), nullptr);
    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
    ASSERT_NE(dy.data(), nullptr);
    ASSERT_NE(dx, dy);

    ASSERT_EQ(dx.extent(0), unsigned(N0));
@ -1317,17 +1322,17 @@ class TestDynViewAPI {
    ASSERT_NE(dx, dz);

    dx = dView0();
    ASSERT_TRUE(dx.data() == nullptr);
    ASSERT_FALSE(dy.data() == nullptr);
    ASSERT_FALSE(dz.data() == nullptr);
    ASSERT_EQ(dx.data(), nullptr);
    ASSERT_NE(dy.data(), nullptr);
    ASSERT_NE(dz.data(), nullptr);
    dy = dView0();
    ASSERT_TRUE(dx.data() == nullptr);
    ASSERT_TRUE(dy.data() == nullptr);
    ASSERT_FALSE(dz.data() == nullptr);
    ASSERT_EQ(dx.data(), nullptr);
    ASSERT_EQ(dy.data(), nullptr);
    ASSERT_NE(dz.data(), nullptr);
    dz = dView0();
    ASSERT_TRUE(dx.data() == nullptr);
    ASSERT_TRUE(dy.data() == nullptr);
    ASSERT_TRUE(dz.data() == nullptr);
    ASSERT_EQ(dx.data(), nullptr);
    ASSERT_EQ(dy.data(), nullptr);
    ASSERT_EQ(dz.data(), nullptr);

    // View - DynRankView Interoperability tests
    // deep_copy from view to dynrankview
@ -1367,7 +1372,7 @@ class TestDynViewAPI {
  static void check_auto_conversion_to_const(
      const Kokkos::DynRankView<const DataType, device>& arg_const,
      const Kokkos::DynRankView<DataType, device>& arg) {
    ASSERT_TRUE(arg_const == arg);
    ASSERT_EQ(arg_const, arg);
  }

  static void run_test_allocated() {
@ -1396,8 +1401,8 @@ class TestDynViewAPI {
    const_typeX xc = x;
    const_typeR xr = x;

    ASSERT_TRUE(xc == x);
    ASSERT_TRUE(x == xc);
    ASSERT_EQ(xc, x);
    ASSERT_EQ(x, xc);

    // For CUDA the constant random access View does not return
    // an lvalue reference due to retrieving through texture cache
@ -1406,7 +1411,7 @@ class TestDynViewAPI {
    if (!std::is_same<typename device::execution_space, Kokkos::Cuda>::value)
#endif
    {
      ASSERT_TRUE(x.data() == xr.data());
      ASSERT_EQ(x.data(), xr.data());
    }

    // typeX xf = xc ; // setting non-const from const must not compile
@ -1659,29 +1664,29 @@ class TestDynViewAPI {
    const_svector_right_type cvr3 =
        Kokkos::subdynrankview(mv, Kokkos::ALL(), 2);

    ASSERT_TRUE(&v1[0] == &v1(0));
    ASSERT_TRUE(&v1[0] == &mv(0, 0));
    ASSERT_TRUE(&v2[0] == &mv(0, 1));
    ASSERT_TRUE(&v3[0] == &mv(0, 2));
    ASSERT_EQ(&v1[0], &v1(0));
    ASSERT_EQ(&v1[0], &mv(0, 0));
    ASSERT_EQ(&v2[0], &mv(0, 1));
    ASSERT_EQ(&v3[0], &mv(0, 2));

    ASSERT_TRUE(&cv1[0] == &mv(0, 0));
    ASSERT_TRUE(&cv2[0] == &mv(0, 1));
    ASSERT_TRUE(&cv3[0] == &mv(0, 2));
    ASSERT_EQ(&cv1[0], &mv(0, 0));
    ASSERT_EQ(&cv2[0], &mv(0, 1));
    ASSERT_EQ(&cv3[0], &mv(0, 2));

    ASSERT_TRUE(&vr1[0] == &mv(0, 0));
    ASSERT_TRUE(&vr2[0] == &mv(0, 1));
    ASSERT_TRUE(&vr3[0] == &mv(0, 2));
    ASSERT_EQ(&vr1[0], &mv(0, 0));
    ASSERT_EQ(&vr2[0], &mv(0, 1));
    ASSERT_EQ(&vr3[0], &mv(0, 2));

    ASSERT_TRUE(&cvr1[0] == &mv(0, 0));
    ASSERT_TRUE(&cvr2[0] == &mv(0, 1));
    ASSERT_TRUE(&cvr3[0] == &mv(0, 2));
    ASSERT_EQ(&cvr1[0], &mv(0, 0));
    ASSERT_EQ(&cvr2[0], &mv(0, 1));
    ASSERT_EQ(&cvr3[0], &mv(0, 2));

    ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2));
    ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3));
    ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4));
    ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2));
    ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3));
    ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4));
    ASSERT_EQ(&mv1(0, 0), &mv(1, 2));
    ASSERT_EQ(&mv1(1, 1), &mv(2, 3));
    ASSERT_EQ(&mv1(3, 2), &mv(4, 4));
    ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2));
    ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3));
    ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4));

    const_svector_type c_cv1(v1);
    typename svector_type::const_type c_cv2(v2);

@ -52,7 +52,7 @@
#include <Kokkos_Core.hpp>

#include <Kokkos_DynamicView.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

namespace Test {

@ -50,7 +50,7 @@
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <Kokkos_OffsetView.hpp>
#include <KokkosExp_MDRangePolicy.hpp>

@ -118,11 +118,51 @@ struct test_scatter_view_impl_cls<DeviceType, Layout, Duplication, Contribution,
      scatter_access(k, 3)++;
      scatter_access(k, 4)--;
      scatter_access(k, 5) -= 5;
// Workaround for an Intel 17 compiler bug which sometimes adds random
// instruction alignment which makes the lock instruction
// illegal. Seems to be mostly just for unsigned int atomics.
// Looking at the assembly the compiler
// appears to insert cache line alignment for the instruction.
// Isn't restricted to specific archs. Seen it on SNB and SKX, but for
// different code. Another occurrence was with Desul atomics in
// a different unit test. This one here happens without desul atomics.
// Inserting an assembly nop instruction changes the alignment and
// works around this.
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      scatter_access_atomic(k, 6) += 2;
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      scatter_access_atomic(k, 7)++;
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      scatter_access_atomic(k, 8)--;
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      --scatter_access_atomic(k, 9);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      ++scatter_access_atomic(k, 10);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
      asm volatile("nop\n");
#endif
#endif
      scatter_access(k, 11) -= 3;
    }
  }

@ -180,8 +180,6 @@ void run_test_graph3(size_t B, size_t N) {

  std::vector<size_t> sizes(LENGTH);

  size_t total_length = 0;

  for (size_t i = 0; i < LENGTH; ++i) {
    sizes[i] = rand() % 1000;
  }
@ -189,10 +187,6 @@ void run_test_graph3(size_t B, size_t N) {
  sizes[1]    = N;
  sizes[1998] = N;

  for (size_t i = 0; i < LENGTH; ++i) {
    total_length += sizes[i];
  }

  int C = 0;
  dView dx = Kokkos::create_staticcrsgraph<dView>("test", sizes);
  dx.create_block_partitioning(B, C);

@ -295,10 +295,8 @@ void test_deep_copy(uint32_t num_nodes) {
}

// FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs
// FIXME_HIP
// WORKAROUND MSVC
#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \
    !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
#if !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL)
TEST(TEST_CATEGORY, UnorderedMap_insert) {
  for (int i = 0; i < 500; ++i) {
    test_insert<TEST_EXECSPACE>(100000, 90000, 100, true);
@ -329,6 +327,23 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) {
  ASSERT_TRUE(n.is_allocated());
}

TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) {
  using Map =
      Kokkos::UnorderedMap<int, void, Kokkos::DefaultHostExecutionSpace>;

  Map m(11);
  ASSERT_EQ(0u, m.size());

  m.insert(2);
  m.insert(3);
  m.insert(5);
  m.insert(7);
  ASSERT_EQ(4u, m.size());

  m.clear();
  ASSERT_EQ(0u, m.size());
}

}  // namespace Test

#endif  // KOKKOS_TEST_UNORDERED_MAP_HPP

@ -1,104 +0,0 @@
/* The trivial 'src/build_common.sh' creates a config
 * that must stay in sync with this file.
 */
#cmakedefine KOKKOS_FOR_SIERRA

#if !defined(KOKKOS_FOR_SIERRA)

#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
#error \
    "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
#else
#define KOKKOS_CORE_CONFIG_H
#endif

#cmakedefine KOKKOS_ENABLE_CUDA
#cmakedefine KOKKOS_ENABLE_HIP
#cmakedefine KOKKOS_ENABLE_OPENMP
#cmakedefine KOKKOS_ENABLE_THREADS
#cmakedefine KOKKOS_ENABLE_SERIAL
#cmakedefine KOKKOS_ENABLE_Winthread

#cmakedefine KOKKOS_ENABLE_HWLOC
#cmakedefine KOKKOS_ENABLE_HBWSPACE
#cmakedefine KOKKOS_ENABLE_LIBRT

#cmakedefine KOKKOS_ENABLE_DEBUG
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
#cmakedefine KOKKOS_ENABLE_TUNING

#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION

#ifdef KOKKOS_ENABLE_CUDA

#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC

// mfh 16 Sep 2014: If passed in on the command line, that overrides
// any value of KOKKOS_USE_CUDA_UVM here. Doing this should prevent build
// warnings like this one:
//
// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning:
// "KOKKOS_USE_CUDA_UVM" redefined
//
// At some point, we should edit the test-build scripts in
// Trilinos/cmake/ctest/drivers/perseus/, and take
// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there. I
// hesitate to do that now, because I'm not sure if all the files are
// including KokkosCore_config.h (or a header file that includes it) like
// they should.
#ifndef KOKKOS_USE_CUDA_UVM
#cmakedefine KOKKOS_USE_CUDA_UVM
#endif

#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE

#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA

#endif

#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND

#ifndef __CUDA_ARCH__
#cmakedefine KOKKOS_ENABLE_ISA_X86_64
#cmakedefine KOKKOS_ENABLE_ISA_KNC
#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE
#endif

#ifdef KOKKOS_ENABLE_HIP
#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#endif

#cmakedefine KOKKOS_ARCH_ARMV80 1
#cmakedefine KOKKOS_ARCH_ARMV81 1
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1
#cmakedefine KOKKOS_ARCH_AVX 1
#cmakedefine KOKKOS_ARCH_AVX2 1
#cmakedefine KOKKOS_ARCH_AVX512MIC 1
#cmakedefine KOKKOS_ARCH_AVX512XEON 1
#cmakedefine KOKKOS_ARCH_KNC 1
#cmakedefine KOKKOS_ARCH_POWER8 1
#cmakedefine KOKKOS_ARCH_POWER9 1
#cmakedefine KOKKOS_ARCH_KEPLER 1
#cmakedefine KOKKOS_ARCH_KEPLER30 1
#cmakedefine KOKKOS_ARCH_KEPLER32 1
#cmakedefine KOKKOS_ARCH_KEPLER35 1
#cmakedefine KOKKOS_ARCH_KEPLER37 1
#cmakedefine KOKKOS_ARCH_MAXWELL 1
#cmakedefine KOKKOS_ARCH_MAXWELL50 1
#cmakedefine KOKKOS_ARCH_MAXWELL52 1
#cmakedefine KOKKOS_ARCH_MAXWELL53 1
#cmakedefine KOKKOS_ARCH_PASCAL 1
#cmakedefine KOKKOS_ARCH_PASCAL60 1
#cmakedefine KOKKOS_ARCH_PASCAL61 1
#cmakedefine KOKKOS_ARCH_VOLTA70 1

// TODO: These are currently not used in Kokkos. Should they be removed?
#cmakedefine KOKKOS_ENABLE_MPI
#cmakedefine KOKKOS_ENABLE_CUSPARSE

// TODO: No longer options in Kokkos. Need to be removed.
#cmakedefine KOKKOS_USING_DEPRECATED_VIEW

#endif  // !defined(KOKKOS_FOR_SIERRA)
@ -10,9 +10,7 @@
#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")

# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests.
IF (KOKKOS_ENABLE_OPENMPTARGET
    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI
         OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
IF (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
  RETURN()
ENDIF()

@ -231,7 +231,7 @@ void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,

    std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
              << min_seconds << " , " << (min_seconds / parallel_work_length)
              << std::endl;
              << ", " << avg_seconds << std::endl;
  }
}

@ -280,7 +280,7 @@ void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,

    std::cout << label_hexgrad << " , " << parallel_work_length << " , "
              << min_seconds << " , " << (min_seconds / parallel_work_length)
              << std::endl;
              << avg_seconds << std::endl;
  }
}

@ -205,7 +205,7 @@ TEST(default_exec, overlap_range_policy) {
  double time_end = timer.seconds();

  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE((time_end > 1.5 * time_overlap));
    ASSERT_GT(time_end, 1.5 * time_overlap);
  }
  printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
         time_overlap);
@ -238,7 +238,7 @@ TEST(default_exec, overlap_range_policy) {
  double time_not_fenced = timer.seconds();
  Kokkos::fence();
  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
  }

  timer.reset();
@ -280,7 +280,7 @@ TEST(default_exec, overlap_range_policy) {
  ASSERT_EQ(h_result2(), h_result());

  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
  }
  printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
         time_no_overlapped_reduce, time_overlapped_reduce);
@ -378,7 +378,7 @@ TEST(default_exec, overlap_mdrange_policy) {
  double time_end = timer.seconds();

  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE((time_end > 1.5 * time_overlap));
    ASSERT_GT(time_end, 1.5 * time_overlap);
  }
  printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
         time_overlap);
@ -413,7 +413,7 @@ TEST(default_exec, overlap_mdrange_policy) {
  double time_not_fenced = timer.seconds();
  Kokkos::fence();
  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
  }

  timer.reset();
@ -459,7 +459,7 @@ TEST(default_exec, overlap_mdrange_policy) {
  ASSERT_EQ(h_result2(), h_result());

  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
  }
  printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
         time_no_overlapped_reduce, time_overlapped_reduce);
@ -548,7 +548,7 @@ TEST(default_exec, overlap_team_policy) {
  double time_end = timer.seconds();

  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE((time_end > 1.5 * time_overlap));
    ASSERT_GT(time_end, 1.5 * time_overlap);
  }
  printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
         time_overlap);
@ -581,7 +581,7 @@ TEST(default_exec, overlap_team_policy) {
  double time_not_fenced = timer.seconds();
  Kokkos::fence();
  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
  }
  timer.reset();
  Kokkos::parallel_reduce(
@ -622,7 +622,7 @@ TEST(default_exec, overlap_team_policy) {
  ASSERT_EQ(h_result2(), h_result());

  if (SpaceInstance<TEST_EXECSPACE>::overlap()) {
    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
  }
  printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
         time_no_overlapped_reduce, time_overlapped_reduce);

@ -120,7 +120,8 @@ void run_allocateview_tests(int N, int R) {
  {
    Kokkos::Timer timer;
    for (int r = 0; r < R; r++) {
      double* a_ptr = (double*)Kokkos::kokkos_malloc("A", sizeof(double) * N8);
      double* a_ptr =
          static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * N8));
      Kokkos::parallel_for(
          N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; });
      Kokkos::fence();
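Raw allocations like the one above must be released with the matching deallocator. A hedged sketch of the full malloc/use/free cycle (the extent n and the label "A" are illustrative):

const int n = 1 << 20;
double* a_ptr =
    static_cast<double*>(Kokkos::kokkos_malloc("A", sizeof(double) * n));
Kokkos::parallel_for(
    n, KOKKOS_LAMBDA(const int i) { a_ptr[i] = 0.0; });
Kokkos::fence("example: fence before freeing 'A'");
Kokkos::kokkos_free(a_ptr);  // pairs with kokkos_malloc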

@ -47,10 +47,18 @@

namespace Test {

TEST(default_exec, ViewResize_Rank8) {
// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI
#ifdef KOKKOS_ENABLE_SYCL
  printf("Resize View Performance for LayoutLeft:\n");
  run_resizeview_tests8<Kokkos::LayoutLeft>(9, 1);
  printf("Resize View Performance for LayoutRight:\n");
  run_resizeview_tests8<Kokkos::LayoutRight>(9, 1);
#else
  printf("Resize View Performance for LayoutLeft:\n");
  run_resizeview_tests8<Kokkos::LayoutLeft>(10, 1);
  printf("Resize View Performance for LayoutRight:\n");
  run_resizeview_tests8<Kokkos::LayoutRight>(10, 1);
#endif
}

}  // namespace Test

@ -47,7 +47,7 @@
#include <cstdlib>

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

using exec_space = Kokkos::DefaultExecutionSpace;

@ -401,7 +401,7 @@ template <class T>
void Loop(int loop, int test, const char* type_name) {
  LoopVariant<T>(loop, test);

  Kokkos::Impl::Timer timer;
  Kokkos::Timer timer;
  T res = LoopVariant<T>(loop, test);
  double time = timer.seconds();
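Kokkos::Timer, which replaces the internal Kokkos::Impl::Timer throughout these files, is a simple wall-clock stopwatch. A minimal usage sketch; the workload loop is a stand-in:

#include <cstdio>
#include <Kokkos_Timer.hpp>

int main() {
  Kokkos::Timer timer;  // starts timing on construction
  volatile double x = 0.0;
  for (int i = 0; i < 1000000; ++i) x = x + 1.0;  // stand-in workload
  double elapsed = timer.seconds();  // wall-clock seconds since construction
  timer.reset();                     // restart for the next measurement
  std::printf("elapsed: %f s\n", elapsed);
}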

@ -12,13 +12,13 @@
#include <typeinfo>

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

using exec_space = Kokkos::DefaultExecutionSpace;

template <typename T>
void test(const int length) {
  Kokkos::Impl::Timer timer;
  Kokkos::Timer timer;

  using vector = Kokkos::View<T*, exec_space>;

@ -48,7 +48,7 @@
#include <limits>

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

using ExecSpace   = Kokkos::DefaultExecutionSpace;
using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
@ -100,7 +100,7 @@ struct TestFunctor {

      const unsigned size_alloc = chunk * (1 + (j % chunk_span));

      ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
      ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));

      if (ptrs(j)) ++update;
    }
@ -129,7 +129,7 @@ struct TestFunctor {

      const unsigned size_alloc = chunk * (1 + (j % chunk_span));

      pool.deallocate((void*)ptrs(j), size_alloc);
      pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
    }
  }

@ -153,9 +153,9 @@ struct TestFunctor {
    for (unsigned k = 0; k < repeat_inner; ++k) {
      const unsigned size_alloc = chunk * (1 + (j % chunk_span));

      pool.deallocate((void*)ptrs(j), size_alloc);
      pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);

      ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
      ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));

      if (0 == ptrs(j)) update++;
    }
@ -266,7 +266,7 @@ int main(int argc, char* argv[]) {
  TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                      fill_stride, chunk_span, repeat_inner);

  Kokkos::Impl::Timer timer;
  Kokkos::Timer timer;

  if (!functor.test_fill()) {
    Kokkos::abort("fill ");

@ -56,7 +56,7 @@ int main() { return 0; }
#include <cstdlib>
#include <limits>

#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>

using ExecSpace = Kokkos::DefaultExecutionSpace;

@ -220,7 +220,7 @@ int main(int argc, char* argv[]) {
  double time_sum = 0;

  for (int i = 0; i < test_repeat_outer; ++i) {
    Kokkos::Impl::Timer timer;
    Kokkos::Timer timer;

    Functor::FutureType ftmp =
        Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input));

@ -9,6 +9,8 @@ INSTALL (DIRECTORY
  "${CMAKE_CURRENT_SOURCE_DIR}/"
  DESTINATION ${KOKKOS_HEADER_DIR}
  FILES_MATCHING
  PATTERN "*.inc"
  PATTERN "*.inc_*"
  PATTERN "*.hpp"
  PATTERN "*.h"
)
@ -65,6 +67,15 @@ IF (KOKKOS_ENABLE_SYCL)
  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp)
ENDIF()

IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS)
  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/desul/src/*.cpp)
  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*.hpp)
  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*.hpp)
  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.hpp)
  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.inc)
ENDIF()

KOKKOS_ADD_LIBRARY(
  kokkoscore
  SOURCES ${KOKKOS_CORE_SRCS}
@ -86,3 +97,15 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL)
KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT)
KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD)
KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)

# FIXME: We need a proper solution to figure out whether to enable
# libatomic
# XL requires libatomic even for 64 bit CAS, most others only for 128
# I (CT) had removed 128bit CAS from desul to not need libatomic.
IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS AND
    (KOKKOS_ENABLE_OPENMPTARGET OR (CMAKE_CXX_COMPILER_ID STREQUAL XLClang)))
  target_link_libraries(kokkoscore PUBLIC atomic)
ENDIF()

KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH)

@ -90,43 +90,25 @@ static std::atomic<int> num_uvm_allocations(0);

}  // namespace

DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
                                               size_t n) {
  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
void DeepCopyCuda(void *dst, const void *src, size_t n) {
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
}

DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(void *dst, const void *src,
                                               size_t n) {
  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
}

DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src,
                                               size_t n) {
  CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault));
}

DeepCopy<CudaSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
                                               const void *src, size_t n) {
  CUDA_SAFE_CALL(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
}

DeepCopy<HostSpace, CudaSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
                                               const void *src, size_t n) {
  CUDA_SAFE_CALL(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
}

DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(const Cuda &instance, void *dst,
                                               const void *src, size_t n) {
  CUDA_SAFE_CALL(
void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src,
                       size_t n) {
  KOKKOS_IMPL_CUDA_SAFE_CALL(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream()));
}

void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {
  cudaStream_t s = cuda_get_deep_copy_stream();
  CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
  cudaStreamSynchronize(s);
  KOKKOS_IMPL_CUDA_SAFE_CALL(
      cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s));
  Impl::cuda_stream_synchronize(
      s,
      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
          DeepCopyResourceSynchronization,
      "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync");
}

}  // namespace Impl
|
||||
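
Reviewer note: the hunk above collapses six nearly identical DeepCopy constructor bodies into two shared helpers, DeepCopyCuda and DeepCopyAsyncCuda. A minimal sketch of the delegation pattern, with the surrounding specializations abbreviated (this is an illustration, not the full upstream code):

// Sketch only: each per-space DeepCopy specialization now forwards to a
// single pair of helpers instead of calling cudaMemcpy itself.
namespace Kokkos { namespace Impl {

void DeepCopyCuda(void *dst, const void *src, size_t n);   // blocking copy
void DeepCopyAsyncCuda(const Cuda &instance, void *dst,
                       const void *src, size_t n);         // stream-ordered copy

DeepCopy<CudaSpace, HostSpace, Cuda>::DeepCopy(void *dst, const void *src,
                                               size_t n) {
  DeepCopyCuda(dst, src, n);  // one implementation shared by all variants
}

}} // namespace Kokkos::Impl
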
@ -137,6 +119,7 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) {

namespace Kokkos {

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
KOKKOS_DEPRECATED void CudaSpace::access_error() {
const std::string msg(
"Kokkos::CudaSpace::access_error attempt to execute Cuda function from "
@ -150,6 +133,7 @@ KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) {
"non-Cuda space");
Kokkos::Impl::throw_runtime_exception(msg);
}
#endif

/*--------------------------------------------------------------------------*/

@ -164,9 +148,11 @@ bool CudaUVMSpace::available() {

/*--------------------------------------------------------------------------*/

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
int CudaUVMSpace::number_of_allocations() {
return Kokkos::Impl::num_uvm_allocations.load();
}
#endif
#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
// The purpose of the following variable is to allow a state-based choice
// for pinning UVM allocations to the CPU. For now this is considered
@ -204,6 +190,8 @@ CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {}

CudaHostPinnedSpace::CudaHostPinnedSpace() {}

int memory_threshold_g = 40000; // 40 kB

//==============================================================================
// <editor-fold desc="allocate()"> {{{1

@ -221,7 +209,19 @@ void *CudaSpace::impl_allocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
void *ptr = nullptr;

#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
cudaError_t error_code;
if (arg_alloc_size >= memory_threshold_g) {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
} else {
error_code = cudaMalloc(&ptr, arg_alloc_size);
}
#else
auto error_code = cudaMalloc(&ptr, arg_alloc_size);
#endif
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
cudaGetLastError(); // This is the only way to clear the last error, which
// we should do here since we're turning it into an
@ -253,7 +253,8 @@ void *CudaUVMSpace::impl_allocate(
const Kokkos::Tools::SpaceHandle arg_handle) const {
void *ptr = nullptr;

Cuda::impl_static_fence();
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation");
if (arg_alloc_size > 0) {
Kokkos::Impl::num_uvm_allocations++;

@ -276,7 +277,8 @@ void *CudaUVMSpace::impl_allocate(
CudaMallocManaged);
}
}
Cuda::impl_static_fence();
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation");
if (Kokkos::Profiling::profileLibraryLoaded()) {
const size_t reported_size =
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@ -337,9 +339,20 @@ void CudaSpace::impl_deallocate(
Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
reported_size);
}

try {
CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
} else {
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
}
#else
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
#endif
} catch (...) {
}
}
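
Reviewer note: the two hunks above route allocations at or above memory_threshold_g (40000 bytes) through cudaMallocAsync on CUDA 11.2+, and mirror the split on the free path. A hedged sketch of the dispatch logic in isolation (error handling elided; the threshold value and surrounding synchronization come from the diff, the function name is hypothetical):

// Sketch of the size-threshold dispatch (requires CUDA >= 11.2 and the
// KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC option).
#include <cuda_runtime.h>

void *allocate_with_threshold(size_t bytes) {
  void *ptr = nullptr;
  if (bytes >= 40000) {                  // memory_threshold_g in the diff
    cudaMallocAsync(&ptr, bytes, /*stream=*/0);
    cudaDeviceSynchronize();             // make the pointer usable everywhere
  } else {
    cudaMalloc(&ptr, bytes);             // small requests stay synchronous
  }
  return ptr;
}
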
@ -362,7 +375,8 @@ void CudaUVMSpace::impl_deallocate(
,
const size_t arg_logical_size,
const Kokkos::Tools::SpaceHandle arg_handle) const {
Cuda::impl_static_fence();
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation");
if (Kokkos::Profiling::profileLibraryLoaded()) {
const size_t reported_size =
(arg_logical_size > 0) ? arg_logical_size : arg_alloc_size;
@ -372,11 +386,12 @@ void CudaUVMSpace::impl_deallocate(
try {
if (arg_alloc_ptr != nullptr) {
Kokkos::Impl::num_uvm_allocations--;
CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
}
} catch (...) {
}
Cuda::impl_static_fence();
Cuda::impl_static_fence(
"Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation");
}

void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr,
@ -401,7 +416,7 @@ void CudaHostPinnedSpace::impl_deallocate(
reported_size);
}
try {
CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr));
} catch (...) {
}
}
@ -462,7 +477,7 @@ SharedAllocationRecord<Kokkos::CudaSpace, void>::attach_texture_object(
resDesc.res.linear.sizeInBytes = alloc_size;
resDesc.res.linear.devPtr = alloc_ptr;

CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr));

return tex_obj;
@ -581,7 +596,7 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
bool to_device) {
if ((ptr == nullptr) || (bytes == 0)) return;
cudaPointerAttributes attr;
CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr));
// I measured this and it turns out prefetching towards the host slows
// DualView syncs down. Probably because the latency is not too bad in the
// first place for the pull down. If we want to change that provde
@ -593,8 +608,8 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
#endif
if (to_device && is_managed &&
space.cuda_device_prop().concurrentManagedAccess) {
CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(),
space.cuda_stream()));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync(
ptr, bytes, space.cuda_device(), space.cuda_stream()));
}
}


@ -134,7 +134,12 @@ inline int cuda_deduce_block_size(bool early_termination,
}

if (blocks_per_sm >= min_blocks_per_sm) {
if (threads_per_sm >= opt_threads_per_sm) {
// The logic prefers smaller block sizes over larger ones to
// give more flexibility to the scheduler.
// But don't go below 128 where performance suffers significantly
// for simple copy/set kernels.
if ((threads_per_sm > opt_threads_per_sm) ||
((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) {
opt_block_size = block_size;
opt_threads_per_sm = threads_per_sm;
}

@ -49,13 +49,19 @@
#ifdef KOKKOS_ENABLE_CUDA

#include <impl/Kokkos_Error.hpp>

#include <impl/Kokkos_Profiling.hpp>
#include <iosfwd>

namespace Kokkos {
namespace Impl {

void cuda_device_synchronize();
void cuda_stream_synchronize(
const cudaStream_t stream,
Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
const std::string& name);
void cuda_device_synchronize(const std::string& name);
void cuda_stream_synchronize(const cudaStream_t stream,
const std::string& name);

void cuda_internal_error_throw(cudaError e, const char* name,
const char* file = nullptr, const int line = 0);
@ -68,9 +74,24 @@ inline void cuda_internal_safe_call(cudaError e, const char* name,
}
}

#define CUDA_SAFE_CALL(call) \
#define KOKKOS_IMPL_CUDA_SAFE_CALL(call) \
Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__)

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3

KOKKOS_DEPRECATED
inline void cuda_internal_safe_call_deprecated(cudaError e, const char* name,
const char* file = nullptr,
const int line = 0) {
cuda_internal_safe_call(e, name, file, line);
}

#define CUDA_SAFE_CALL(call) \
Kokkos::Impl::cuda_internal_safe_call_deprecated(call, #call, __FILE__, \
__LINE__)

#endif

} // namespace Impl

namespace Experimental {

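Reviewer note: the rename keeps the old CUDA_SAFE_CALL spelling behind KOKKOS_ENABLE_DEPRECATED_CODE_3 while new code uses the reserved KOKKOS_IMPL_ prefix. A minimal sketch of the safe-call idiom itself, using hypothetical names (MY_CUDA_SAFE_CALL, safe_call), to show how the macro stringizes the call and forwards file/line:

// Sketch only: convert a CUDA error code into an exception carrying
// call-site information, the way the wrapper above does.
#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

inline void safe_call(cudaError e, const char *name, const char *file,
                      int line) {
  if (e != cudaSuccess)
    throw std::runtime_error(std::string(name) + " failed at " + file + ":" +
                             std::to_string(line));
}
#define MY_CUDA_SAFE_CALL(call) safe_call(call, #call, __FILE__, __LINE__)
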
@ -60,6 +60,7 @@

#include <Kokkos_Cuda.hpp>
#include <cuda_runtime_api.h>
#include <Cuda/Kokkos_Cuda_Error.hpp>

namespace Kokkos {
namespace Impl {
@ -82,8 +83,8 @@ struct GraphImpl<Kokkos::Cuda> {
constexpr size_t error_log_size = 256;
cudaGraphNode_t error_node = nullptr;
char error_log[error_log_size];
CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node,
error_log, error_log_size));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate(
&m_graph_exec, m_graph, &error_node, error_log, error_log_size));
// TODO @graphs print out errors
}

@ -107,26 +108,27 @@ struct GraphImpl<Kokkos::Cuda> {
// TODO @graphs we need to somehow indicate the need for a fence in the
// destructor of the GraphImpl object (so that we don't have to
// just always do it)
m_execution_space.fence();
m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
KOKKOS_EXPECTS(bool(m_graph))
if (bool(m_graph_exec)) {
CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec));
}
CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph));
};

explicit GraphImpl(Kokkos::Cuda arg_instance)
: m_execution_space(std::move(arg_instance)) {
CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphCreate(&m_graph, cuda_graph_flags_t{0}));
}

void add_node(std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
// All of the predecessors are just added as normal, so all we need to
// do here is add an empty node
CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node),
m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
}

template <class NodeImpl>
@ -171,7 +173,7 @@ struct GraphImpl<Kokkos::Cuda> {
auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node;
KOKKOS_EXPECTS(bool(cuda_node))

CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1));
}

@ -179,7 +181,7 @@ struct GraphImpl<Kokkos::Cuda> {
if (!bool(m_graph_exec)) {
_instantiate_graph();
}
CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream()));
}

@ -192,9 +194,10 @@ struct GraphImpl<Kokkos::Cuda> {
KOKKOS_EXPECTS(!bool(m_graph_exec))
auto rv = std::make_shared<root_node_impl_t>(
get_execution_space(), _graph_node_is_root_ctor_tag{});
CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
KOKKOS_ENSURES(bool(rv->node_details_t::node))
return rv;
}

@ -51,6 +51,9 @@
!(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) || \
defined(KOKKOS_ARCH_MAXWELL52))
#include <cuda_fp16.h>
#include <iosfwd> // istream & ostream for extraction and insertion ops
#include <string>
#include <Kokkos_NumericTraits.hpp> // reduction_identity

#ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED
// Make sure no one else tries to define half_t
@ -127,7 +130,7 @@ KOKKOS_INLINE_FUNCTION
std::enable_if_t<std::is_same<T, unsigned long long>::value, T>
cast_from_half(half_t);

class half_t {
class alignas(2) half_t {
public:
using impl_type = Kokkos::Impl::half_impl_t::type;

@ -138,6 +141,22 @@ class half_t {
KOKKOS_FUNCTION
half_t() : val(0.0F) {}

// Copy constructors
KOKKOS_DEFAULTED_FUNCTION
half_t(const half_t&) noexcept = default;

KOKKOS_INLINE_FUNCTION
half_t(const volatile half_t& rhs) {
#ifdef __CUDA_ARCH__
val = rhs.val;
#else
const volatile uint16_t* rv_ptr =
reinterpret_cast<const volatile uint16_t*>(&rhs.val);
const uint16_t rv_val = *rv_ptr;
val = reinterpret_cast<const impl_type&>(rv_val);
#endif // __CUDA_ARCH__
}

// Don't support implicit conversion back to impl_type.
// impl_type is a storage only type on host.
KOKKOS_FUNCTION
@ -219,7 +238,7 @@ class half_t {
#ifdef __CUDA_ARCH__
tmp.val = +tmp.val;
#else
tmp.val = __float2half(+__half2float(tmp.val));
tmp.val = __float2half(+__half2float(tmp.val));
#endif
return tmp;
}
@ -230,7 +249,7 @@ class half_t {
#ifdef __CUDA_ARCH__
tmp.val = -tmp.val;
#else
tmp.val = __float2half(-__half2float(tmp.val));
tmp.val = __float2half(-__half2float(tmp.val));
#endif
return tmp;
}
@ -241,7 +260,7 @@ class half_t {
#ifdef __CUDA_ARCH__
++val;
#else
float tmp = __half2float(val);
float tmp = __half2float(val);
++tmp;
val = __float2half(tmp);
#endif
@ -255,7 +274,7 @@ class half_t {
#else
float tmp = __half2float(val);
--tmp;
val = __float2half(tmp);
val = __float2half(tmp);
#endif
return *this;
}
@ -290,7 +309,10 @@ class half_t {

template <class T>
KOKKOS_FUNCTION void operator=(T rhs) volatile {
val = cast_to_half(rhs).val;
impl_type new_val = cast_to_half(rhs).val;
volatile uint16_t* val_ptr =
reinterpret_cast<volatile uint16_t*>(const_cast<impl_type*>(&val));
*val_ptr = reinterpret_cast<uint16_t&>(new_val);
}

// Compound operators
@ -299,30 +321,21 @@ class half_t {
#ifdef __CUDA_ARCH__
val += rhs.val;
#else
val = __float2half(__half2float(val) + __half2float(rhs.val));
val = __float2half(__half2float(val) + __half2float(rhs.val));
#endif
return *this;
}

KOKKOS_FUNCTION
volatile half_t& operator+=(half_t rhs) volatile {
#ifdef __CUDA_ARCH__
// Cuda 10 supports __half volatile stores but not volatile arithmetic
// operands. Cast away volatile-ness of val for arithmetic but not for store
// location.
val = const_cast<impl_type&>(val) + rhs.val;
#else
// Use non-volatile val_ref to suppress:
// "warning: implicit dereference will not access object of type ‘volatile
// __half’ in statement"
auto val_ref = const_cast<impl_type&>(val);
val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) +
__half2float(rhs.val));
#endif
return *this;
void operator+=(const volatile half_t& rhs) volatile {
half_t tmp_rhs = rhs;
half_t tmp_lhs = *this;

tmp_lhs += tmp_rhs;
*this = tmp_lhs;
}

// Compund operators: upcast overloads for +=
// Compound operators: upcast overloads for +=
template <class T>
KOKKOS_FUNCTION std::enable_if_t<
std::is_same<T, float>::value || std::is_same<T, double>::value, T> friend
@ -350,27 +363,18 @@ class half_t {
#ifdef __CUDA_ARCH__
val -= rhs.val;
#else
val = __float2half(__half2float(val) - __half2float(rhs.val));
val = __float2half(__half2float(val) - __half2float(rhs.val));
#endif
return *this;
}

KOKKOS_FUNCTION
volatile half_t& operator-=(half_t rhs) volatile {
#ifdef __CUDA_ARCH__
// Cuda 10 supports __half volatile stores but not volatile arithmetic
// operands. Cast away volatile-ness of val for arithmetic but not for store
// location.
val = const_cast<impl_type&>(val) - rhs.val;
#else
// Use non-volatile val_ref to suppress:
// "warning: implicit dereference will not access object of type ‘volatile
// __half’ in statement"
auto val_ref = const_cast<impl_type&>(val);
val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) -
__half2float(rhs.val));
#endif
return *this;
void operator-=(const volatile half_t& rhs) volatile {
half_t tmp_rhs = rhs;
half_t tmp_lhs = *this;

tmp_lhs -= tmp_rhs;
*this = tmp_lhs;
}

// Compund operators: upcast overloads for -=
@ -401,27 +405,18 @@ class half_t {
#ifdef __CUDA_ARCH__
val *= rhs.val;
#else
val = __float2half(__half2float(val) * __half2float(rhs.val));
val = __float2half(__half2float(val) * __half2float(rhs.val));
#endif
return *this;
}

KOKKOS_FUNCTION
volatile half_t& operator*=(half_t rhs) volatile {
#ifdef __CUDA_ARCH__
// Cuda 10 supports __half volatile stores but not volatile arithmetic
// operands. Cast away volatile-ness of val for arithmetic but not for store
// location.
val = const_cast<impl_type&>(val) * rhs.val;
#else
// Use non-volatile val_ref to suppress:
// "warning: implicit dereference will not access object of type ‘volatile
// __half’ in statement"
auto val_ref = const_cast<impl_type&>(val);
val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) *
__half2float(rhs.val));
#endif
return *this;
void operator*=(const volatile half_t& rhs) volatile {
half_t tmp_rhs = rhs;
half_t tmp_lhs = *this;

tmp_lhs *= tmp_rhs;
*this = tmp_lhs;
}

// Compund operators: upcast overloads for *=
@ -452,27 +447,18 @@ class half_t {
#ifdef __CUDA_ARCH__
val /= rhs.val;
#else
val = __float2half(__half2float(val) / __half2float(rhs.val));
val = __float2half(__half2float(val) / __half2float(rhs.val));
#endif
return *this;
}

KOKKOS_FUNCTION
volatile half_t& operator/=(half_t rhs) volatile {
#ifdef __CUDA_ARCH__
// Cuda 10 supports __half volatile stores but not volatile arithmetic
// operands. Cast away volatile-ness of val for arithmetic but not for store
// location.
val = const_cast<impl_type&>(val) / rhs.val;
#else
// Use non-volatile val_ref to suppress:
// "warning: implicit dereference will not access object of type ‘volatile
// __half’ in statement"
auto val_ref = const_cast<impl_type&>(val);
val_ref = __float2half(__half2float(const_cast<impl_type&>(val)) /
__half2float(rhs.val));
#endif
return *this;
void operator/=(const volatile half_t& rhs) volatile {
half_t tmp_rhs = rhs;
half_t tmp_lhs = *this;

tmp_lhs /= tmp_rhs;
*this = tmp_lhs;
}

// Compund operators: upcast overloads for /=
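
Reviewer note: all four volatile compound operators are rewritten from in-place const_cast arithmetic to the same copy-in/compute/copy-out shape: snapshot both operands into non-volatile temporaries, do the arithmetic there, then store back through the volatile assignment. A reduced sketch of that pattern, with a hypothetical Half type standing in for half_t:

// Sketch of the copy-in/compute/copy-out pattern adopted above for
// volatile compound assignment.
struct Half {
  float val = 0.0f;  // stand-in for the storage-only __half member
  Half() = default;
  Half(const volatile Half &rhs) : val(rhs.val) {}             // copy in
  void operator=(const Half &rhs) volatile { val = rhs.val; }  // store out
  Half &operator+=(Half rhs) { val += rhs.val; return *this; }
  void operator+=(const volatile Half &rhs) volatile {
    Half tmp_rhs = rhs;    // snapshot both operands into non-volatile copies
    Half tmp_lhs = *this;
    tmp_lhs += tmp_rhs;    // arithmetic happens on ordinary objects
    *this = tmp_lhs;       // single volatile store at the end
  }
};
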
@ -504,7 +490,7 @@ class half_t {
#ifdef __CUDA_ARCH__
lhs.val += rhs.val;
#else
lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val));
#endif
return lhs;
}
@ -529,7 +515,7 @@ class half_t {
#ifdef __CUDA_ARCH__
lhs.val -= rhs.val;
#else
lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val));
#endif
return lhs;
}
@ -554,7 +540,7 @@ class half_t {
#ifdef __CUDA_ARCH__
lhs.val *= rhs.val;
#else
lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val));
#endif
return lhs;
}
@ -579,7 +565,7 @@ class half_t {
#ifdef __CUDA_ARCH__
lhs.val /= rhs.val;
#else
lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val));
#endif
return lhs;
}
@ -683,6 +669,62 @@ class half_t {
return __half2float(val) >= __half2float(rhs.val);
#endif
}

KOKKOS_FUNCTION
friend bool operator==(const volatile half_t& lhs,
const volatile half_t& rhs) {
half_t tmp_lhs = lhs, tmp_rhs = rhs;
return tmp_lhs == tmp_rhs;
}

KOKKOS_FUNCTION
friend bool operator!=(const volatile half_t& lhs,
const volatile half_t& rhs) {
half_t tmp_lhs = lhs, tmp_rhs = rhs;
return tmp_lhs != tmp_rhs;
}

KOKKOS_FUNCTION
friend bool operator<(const volatile half_t& lhs,
const volatile half_t& rhs) {
half_t tmp_lhs = lhs, tmp_rhs = rhs;
return tmp_lhs < tmp_rhs;
}

KOKKOS_FUNCTION
friend bool operator>(const volatile half_t& lhs,
const volatile half_t& rhs) {
half_t tmp_lhs = lhs, tmp_rhs = rhs;
return tmp_lhs > tmp_rhs;
}

KOKKOS_FUNCTION
friend bool operator<=(const volatile half_t& lhs,
const volatile half_t& rhs) {
half_t tmp_lhs = lhs, tmp_rhs = rhs;
return tmp_lhs <= tmp_rhs;
}

KOKKOS_FUNCTION
friend bool operator>=(const volatile half_t& lhs,
const volatile half_t& rhs) {
half_t tmp_lhs = lhs, tmp_rhs = rhs;
return tmp_lhs >= tmp_rhs;
}

// Insertion and extraction operators
friend std::ostream& operator<<(std::ostream& os, const half_t& x) {
const std::string out = std::to_string(static_cast<double>(x));
os << out;
return os;
}

friend std::istream& operator>>(std::istream& is, half_t& x) {
std::string in;
is >> in;
x = std::stod(in);
return is;
}
};

// CUDA before 11.1 only has the half <-> float conversions marked host device
@ -943,6 +985,25 @@ KOKKOS_INLINE_FUNCTION
}
#endif
} // namespace Experimental

// use float as the return type for sum and prod since cuda_fp16.h
// has no constexpr functions for casting to __half
template <>
struct reduction_identity<Kokkos::Experimental::half_t> {
KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() noexcept {
return 0.0F;
}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() noexcept {
return 1.0F;
}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() noexcept {
return -65504.0F;
}
KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() noexcept {
return 65504.0F;
}
};

} // namespace Kokkos
#endif // KOKKOS_IMPL_HALF_TYPE_DEFINED
#endif // KOKKOS_ENABLE_CUDA

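Reviewer note: the reduction_identity specialization above returns float rather than half_t because cuda_fp16.h has no constexpr conversion to __half; 65504 = (2 - 2^-10) * 2^15 is the largest finite IEEE binary16 value, which is why it appears as the min()/max() identity. A hedged usage sketch (v, n, and the lambda body are assumptions, not part of the diff):

// Sketch: a max-reduction over half_t values starts from
// reduction_identity<half_t>::max() == -65504.0f, the most negative
// finite half, converted on initialization.
Kokkos::Experimental::half_t result;
Kokkos::parallel_reduce(
    n,
    KOKKOS_LAMBDA(int i, Kokkos::Experimental::half_t &m) {
      if (v(i) > m) m = v(i);  // v is an assumed View of half_t
    },
    Kokkos::Max<Kokkos::Experimental::half_t>(result));
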
@ -119,7 +119,7 @@ int cuda_kernel_arch() {
int arch = 0;
int *d_arch = nullptr;

cudaMalloc((void **)&d_arch, sizeof(int));
cudaMalloc(reinterpret_cast<void **>(&d_arch), sizeof(int));
cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault);

query_cuda_kernel_arch<<<1, 1>>>(d_arch);
@ -141,7 +141,36 @@ bool cuda_launch_blocking() {

} // namespace

void cuda_device_synchronize() { CUDA_SAFE_CALL(cudaDeviceSynchronize()); }
void cuda_device_synchronize(const std::string &name) {
Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
name,
Kokkos::Tools::Experimental::SpecialSynchronizationCases::
GlobalDeviceSynchronization,
[]() { // TODO: correct device ID
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
});
}

void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
const std::string &name) {
Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
name,
Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
ptr->impl_get_instance_id()},
[&]() { // TODO: correct device ID
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
});
}

void cuda_stream_synchronize(
const cudaStream_t stream,
Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
const std::string &name) {
Kokkos::Tools::Experimental::Impl::profile_fence_event<Kokkos::Cuda>(
name, reason, [&]() { // TODO: correct device ID
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
});
}

void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
const int line) {
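
Reviewer note: every synchronization path now funnels through profile_fence_event, so profiling tools see a human-readable name and an instance id for each fence. A short usage sketch of the named-fence API introduced in this diff (the label strings are assumptions):

// Sketch: with the overloads above, fences can carry names that Kokkos
// Tools report; the unnamed overloads now forward a default label.
Kokkos::Cuda exec;
exec.fence("my_app::after_gather");   // named instance fence
Kokkos::Cuda::impl_static_fence(      // named device-wide fence
    "my_app::global_sync");
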
@ -221,7 +250,7 @@ CudaInternalDevices::CudaInternalDevices() {
// See 'cudaSetDeviceFlags' for host-device thread interaction
// Section 4.4.2.6 of the CUDA Toolkit Reference Manual

CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));

if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort(
@ -229,7 +258,7 @@ CudaInternalDevices::CudaInternalDevices() {
"have. Please report this to github.com/kokkos/kokkos.");
}
for (int i = 0; i < m_cudaDevCount; ++i) {
CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i));
}
}

@ -277,25 +306,27 @@ CudaInternal::~CudaInternal() {
<< std::endl;
}

m_cudaDev = -1;
m_cudaArch = -1;
m_multiProcCount = 0;
m_maxWarpCount = 0;
m_maxBlock = 0;
m_maxSharedWords = 0;
m_maxConcurrency = 0;
m_scratchSpaceCount = 0;
m_scratchFlagsCount = 0;
m_scratchUnifiedCount = 0;
m_scratchUnifiedSupported = 0;
m_streamCount = 0;
m_scratchSpace = nullptr;
m_scratchFlags = nullptr;
m_scratchUnified = nullptr;
m_scratchConcurrentBitset = nullptr;
m_stream = nullptr;
m_team_scratch_current_size = 0;
m_team_scratch_ptr = nullptr;
m_cudaDev = -1;
m_cudaArch = -1;
m_multiProcCount = 0;
m_maxWarpCount = 0;
m_maxBlock = 0;
m_maxSharedWords = 0;
m_maxConcurrency = 0;
m_scratchSpaceCount = 0;
m_scratchFlagsCount = 0;
m_scratchUnifiedCount = 0;
m_scratchUnifiedSupported = 0;
m_streamCount = 0;
m_scratchSpace = nullptr;
m_scratchFlags = nullptr;
m_scratchUnified = nullptr;
m_scratchConcurrentBitset = nullptr;
m_stream = nullptr;
for (int i = 0; i < m_n_team_scratch; ++i) {
m_team_scratch_current_size[i] = 0;
m_team_scratch_ptr[i] = nullptr;
}
}

int CudaInternal::verify_is_initialized(const char *const label) const {
@ -305,16 +336,20 @@ int CudaInternal::verify_is_initialized(const char *const label) const {
}
return 0 <= m_cudaDev;
}

uint32_t CudaInternal::impl_get_instance_id() const { return m_instance_id; }
CudaInternal &CudaInternal::singleton() {
static CudaInternal self;
return self;
}
void CudaInternal::fence(const std::string &name) const {
Impl::cuda_stream_synchronize(m_stream, this, name);
}
void CudaInternal::fence() const {
CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream));
fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence");
}

void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream,
bool manage_stream) {
if (was_finalized)
Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
was_initialized = true;
@ -350,8 +385,9 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {
m_cudaDev = cuda_device_id;
m_deviceProp = cudaProp;

CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
Kokkos::Impl::cuda_device_synchronize();
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev));
Kokkos::Impl::cuda_device_synchronize(
"Kokkos::CudaInternal::initialize: Fence on space initialization");

// Query what compute capability architecture a kernel executes:
m_cudaArch = cuda_kernel_arch();
@ -464,8 +500,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) {

m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());

CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
sizeof(uint32_t) * buffer_bound));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0,
sizeof(uint32_t) * buffer_bound));
}
//----------------------------------

@ -535,15 +571,19 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
// Allocate a staging buffer for constant mem in pinned host memory
// and an event to avoid overwriting driver for previous kernel launches
if (stream == nullptr) {
CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging,
CudaTraits::ConstantMemoryUsage));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMallocHost(reinterpret_cast<void **>(&constantMemHostStaging),
CudaTraits::ConstantMemoryUsage));

CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
}

m_stream = stream;
m_team_scratch_current_size = 0;
m_team_scratch_ptr = nullptr;
m_stream = stream;
m_manage_stream = manage_stream;
for (int i = 0; i < m_n_team_scratch; ++i) {
m_team_scratch_current_size[i] = 0;
m_team_scratch_ptr[i] = nullptr;
}
}

//----------------------------------------------------------------------------
@ -569,7 +609,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {

m_scratchFlags = reinterpret_cast<size_type *>(r->data());

CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
}

@ -645,20 +685,37 @@ Cuda::size_type *CudaInternal::scratch_functor(
return m_scratchFunctor;
}

void *CudaInternal::resize_team_scratch_space(std::int64_t bytes,
bool force_shrink) {
if (m_team_scratch_current_size == 0) {
m_team_scratch_current_size = bytes;
m_team_scratch_ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
"Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size);
std::pair<void *, int> CudaInternal::resize_team_scratch_space(
std::int64_t bytes, bool force_shrink) {
// Multiple ParallelFor/Reduce Teams can call this function at the same time
// and invalidate the m_team_scratch_ptr. We use a pool to avoid any race
// condition.

int current_team_scratch = 0;
int zero = 0;
int one = 1;
while (m_team_scratch_pool[current_team_scratch].compare_exchange_weak(
zero, one, std::memory_order_release, std::memory_order_relaxed)) {
current_team_scratch = (current_team_scratch + 1) % m_n_team_scratch;
}
if ((bytes > m_team_scratch_current_size) ||
((bytes < m_team_scratch_current_size) && (force_shrink))) {
m_team_scratch_current_size = bytes;
m_team_scratch_ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
m_team_scratch_ptr, m_team_scratch_current_size);
if (m_team_scratch_current_size[current_team_scratch] == 0) {
m_team_scratch_current_size[current_team_scratch] = bytes;
m_team_scratch_ptr[current_team_scratch] =
Kokkos::kokkos_malloc<Kokkos::CudaSpace>(
"Kokkos::CudaSpace::TeamScratchMemory",
m_team_scratch_current_size[current_team_scratch]);
}
return m_team_scratch_ptr;
if ((bytes > m_team_scratch_current_size[current_team_scratch]) ||
((bytes < m_team_scratch_current_size[current_team_scratch]) &&
(force_shrink))) {
m_team_scratch_current_size[current_team_scratch] = bytes;
m_team_scratch_ptr[current_team_scratch] =
Kokkos::kokkos_realloc<Kokkos::CudaSpace>(
m_team_scratch_ptr[current_team_scratch],
m_team_scratch_current_size[current_team_scratch]);
}
return std::make_pair(m_team_scratch_ptr[current_team_scratch],
current_team_scratch);
}

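Reviewer note: the rewrite above replaces the single team-scratch buffer with a 10-slot pool; a slot is claimed via compare-exchange on an atomic flag and released later (by the ParallelFor destructor shown further down), so concurrent launches cannot reallocate each other's buffer. A reduced sketch of the intended claim/release protocol — note this sketch loops until the exchange *succeeds*, and all names here are hypothetical:

// Sketch of a slot-claim loop: probe slots until a 0 -> 1 compare-exchange
// succeeds, use the slot, then store 0 to release it.
#include <atomic>

constexpr int kSlots = 10;      // m_n_team_scratch in the diff
std::atomic_int pool[kSlots];   // zero-initialized claim flags

int acquire_slot() {
  int s = 0;
  int expected = 0;
  while (!pool[s].compare_exchange_weak(expected, 1)) {
    expected = 0;               // reset after a failed exchange
    s = (s + 1) % kSlots;       // try the next slot
  }
  return s;
}

void release_slot(int s) { pool[s].store(0); }
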
//----------------------------------------------------------------------------
@ -685,36 +742,43 @@ void CudaInternal::finalize() {
if (m_scratchFunctorSize > 0)
RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));

if (m_team_scratch_current_size > 0)
Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr);
for (int i = 0; i < m_n_team_scratch; ++i) {
if (m_team_scratch_current_size[i] > 0)
Kokkos::kokkos_free<Kokkos::CudaSpace>(m_team_scratch_ptr[i]);
}

m_cudaDev = -1;
m_multiProcCount = 0;
m_maxWarpCount = 0;
m_maxBlock = 0;
m_maxSharedWords = 0;
m_scratchSpaceCount = 0;
m_scratchFlagsCount = 0;
m_scratchUnifiedCount = 0;
m_streamCount = 0;
m_scratchSpace = nullptr;
m_scratchFlags = nullptr;
m_scratchUnified = nullptr;
m_scratchConcurrentBitset = nullptr;
m_stream = nullptr;
m_team_scratch_current_size = 0;
m_team_scratch_ptr = nullptr;
if (m_manage_stream && m_stream != nullptr)
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream));

m_cudaDev = -1;
m_multiProcCount = 0;
m_maxWarpCount = 0;
m_maxBlock = 0;
m_maxSharedWords = 0;
m_scratchSpaceCount = 0;
m_scratchFlagsCount = 0;
m_scratchUnifiedCount = 0;
m_streamCount = 0;
m_scratchSpace = nullptr;
m_scratchFlags = nullptr;
m_scratchUnified = nullptr;
m_scratchConcurrentBitset = nullptr;
m_stream = nullptr;
for (int i = 0; i < m_n_team_scratch; ++i) {
m_team_scratch_current_size[i] = 0;
m_team_scratch_ptr[i] = nullptr;
}
}

// only destroy these if we're finalizing the singleton
if (this == &singleton()) {
cudaFreeHost(constantMemHostStaging);
cudaEventDestroy(constantMemReusable);
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable));
auto &deep_copy_space =
Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false);
if (deep_copy_space)
deep_copy_space->impl_internal_space_instance()->finalize();
cudaStreamDestroy(cuda_get_deep_copy_stream());
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream()));
}
}

@ -823,7 +887,7 @@ Cuda::Cuda()
"Cuda instance constructor");
}

Cuda::Cuda(cudaStream_t stream)
Cuda::Cuda(cudaStream_t stream, bool manage_stream)
: m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) {
ptr->finalize();
delete ptr;
@ -831,18 +895,31 @@ Cuda::Cuda(cudaStream_t stream)
Impl::CudaInternal::singleton().verify_is_initialized(
"Cuda instance constructor");
m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev,
stream);
stream, manage_stream);
}

void Cuda::print_configuration(std::ostream &s, const bool) {
Impl::CudaInternal::singleton().print_configuration(s);
}

void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); }
void Cuda::impl_static_fence(const std::string &name) {
Kokkos::Impl::cuda_device_synchronize(name);
}
void Cuda::impl_static_fence() {
impl_static_fence("Kokkos::Cuda::impl_static_fence(): Unnamed Static Fence");
}

void Cuda::fence() const { m_space_instance->fence(); }
void Cuda::fence() const {
fence("Kokkos::Cuda::fence(): Unnamed Instance Fence");
}
void Cuda::fence(const std::string &name) const {
m_space_instance->fence(name);
}

const char *Cuda::name() { return "Cuda"; }
uint32_t Cuda::impl_instance_id() const noexcept {
return m_space_instance->impl_get_instance_id();
}

cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; }
int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; }
@ -877,7 +954,15 @@ void CudaSpaceInitializer::finalize(bool all_spaces) {
}
}

void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); }
void CudaSpaceInitializer::fence() {
Kokkos::Cuda::impl_static_fence(
"Kokkos::CudaSpaceInitializer::fence: Initializer Fence");
}
void CudaSpaceInitializer::fence(const std::string &name) {
// Kokkos::Cuda::impl_static_fence("Kokkos::CudaSpaceInitializer::fence:
// "+name); //TODO: or this
Kokkos::Cuda::impl_static_fence(name);
}

void CudaSpaceInitializer::print_configuration(std::ostream &msg,
const bool detail) {
@ -916,12 +1001,6 @@ void CudaSpaceInitializer::print_configuration(std::ostream &msg,
msg << "yes\n";
#else
msg << "no\n";
#endif
msg << " KOKKOS_ENABLE_CUSPARSE: ";
#ifdef KOKKOS_ENABLE_CUSPARSE
msg << "yes\n";
#else
msg << "no\n";
#endif
msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
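
Reviewer note: with the new manage_stream flag threaded through Cuda::Cuda and CudaInternal::initialize above, an execution space instance can now adopt a user-created stream and own it, destroying it in finalize(). A hedged usage sketch (the work inside the scope is an assumption):

// Sketch: constructing an execution space instance that owns its stream.
// With manage_stream = true, finalize() calls cudaStreamDestroy for us.
cudaStream_t stream;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
{
  Kokkos::Cuda exec(stream, /*manage_stream=*/true);
  // ... launch work on exec ...
}  // last reference dropped -> finalize() destroys the stream
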
@ -3,6 +3,9 @@

#include <vector>
#include <impl/Kokkos_Tools.hpp>
#include <atomic>
#include <Cuda/Kokkos_Cuda_Error.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// These functions fulfill the purpose of allowing to work around
@ -114,10 +117,14 @@ class CudaInternal {
mutable size_type* m_scratchFunctor;
uint32_t* m_scratchConcurrentBitset;
cudaStream_t m_stream;
uint32_t m_instance_id;
bool m_manage_stream;

// Team Scratch Level 1 Space
mutable int64_t m_team_scratch_current_size;
mutable void* m_team_scratch_ptr;
int m_n_team_scratch = 10;
mutable int64_t m_team_scratch_current_size[10];
mutable void* m_team_scratch_ptr[10];
mutable std::atomic_int m_team_scratch_pool[10];

bool was_initialized = false;
bool was_finalized = false;
@ -135,7 +142,8 @@ class CudaInternal {
return nullptr != m_scratchSpace && nullptr != m_scratchFlags;
}

void initialize(int cuda_device_id, cudaStream_t stream = nullptr);
void initialize(int cuda_device_id, cudaStream_t stream = nullptr,
bool manage_stream = false);
void finalize();

void print_configuration(std::ostream&) const;
@ -145,6 +153,7 @@ class CudaInternal {
static void cuda_set_serial_execution(bool);
#endif

void fence(const std::string&) const;
void fence() const;

~CudaInternal();
@ -175,20 +184,68 @@ class CudaInternal {
m_scratchFunctor(nullptr),
m_scratchConcurrentBitset(nullptr),
m_stream(nullptr),
m_team_scratch_current_size(0),
m_team_scratch_ptr(nullptr) {}
m_instance_id(
Kokkos::Tools::Experimental::Impl::idForInstance<Kokkos::Cuda>(
reinterpret_cast<uintptr_t>(this))) {
for (int i = 0; i < m_n_team_scratch; ++i) {
m_team_scratch_current_size[i] = 0;
m_team_scratch_ptr[i] = nullptr;
m_team_scratch_pool[i] = 0;
}
}

// Resizing of reduction related scratch spaces
size_type* scratch_space(const size_type size) const;
size_type* scratch_flags(const size_type size) const;
size_type* scratch_unified(const size_type size) const;
size_type* scratch_functor(const size_type size) const;

uint32_t impl_get_instance_id() const;
// Resizing of team level 1 scratch
void* resize_team_scratch_space(std::int64_t bytes,
bool force_shrink = false);
std::pair<void*, int> resize_team_scratch_space(std::int64_t bytes,
bool force_shrink = false);
};

} // Namespace Impl

namespace Experimental {
// Partitioning an Execution Space: expects space and integer arguments for
// relative weight
// Customization point for backends
// Default behavior is to return the passed in instance

namespace Impl {
inline void create_Cuda_instances(std::vector<Cuda>& instances) {
for (int s = 0; s < int(instances.size()); s++) {
cudaStream_t stream;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream));
instances[s] = Cuda(stream, true);
}
}
} // namespace Impl

template <class... Args>
std::vector<Cuda> partition_space(const Cuda&, Args...) {
#ifdef __cpp_fold_expressions
static_assert(
(... && std::is_arithmetic_v<Args>),
"Kokkos Error: partitioning arguments must be integers or floats");
#endif
std::vector<Cuda> instances(sizeof...(Args));
Impl::create_Cuda_instances(instances);
return instances;
}

template <class T>
std::vector<Cuda> partition_space(const Cuda&, std::vector<T>& weights) {
static_assert(
std::is_arithmetic<T>::value,
"Kokkos Error: partitioning arguments must be integers or floats");

std::vector<Cuda> instances(weights.size());
Impl::create_Cuda_instances(instances);
return instances;
}
} // namespace Experimental

} // Namespace Kokkos
#endif
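
Reviewer note: partition_space returns one Cuda instance per weight argument, each backed by its own stream created in create_Cuda_instances; in this backend the weights are only validated, not yet used to apportion resources. A hedged usage sketch (functor_a, functor_b, and n are assumptions):

// Sketch: splitting independent work across two streams via the new API.
auto instances =
    Kokkos::Experimental::partition_space(Kokkos::Cuda(), 1, 1);
Kokkos::parallel_for(
    Kokkos::RangePolicy<Kokkos::Cuda>(instances[0], 0, n), functor_a);
Kokkos::parallel_for(
    Kokkos::RangePolicy<Kokkos::Cuda>(instances[1], 0, n), functor_b);
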
@ -167,7 +167,7 @@ inline void configure_shmem_preference(KernelFuncPtr const& func,
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
auto set_cache_config = [&] {
CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
func,
(prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
return prefer_shmem;
@ -372,14 +372,15 @@ struct CudaParallelLaunchKernelInvoker<
params.kernelParams = (void**)args;
params.extra = nullptr;

CUDA_SAFE_CALL(cudaGraphAddKernelNode(
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
&graph_node, graph, /* dependencies = */ nullptr,
/* numDependencies = */ 0, &params));
} else {
// We still need an empty node for the dependency structure
CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&graph_node, graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
}
KOKKOS_ENSURES(bool(graph_node))
}
@ -475,14 +476,15 @@ struct CudaParallelLaunchKernelInvoker<
params.kernelParams = (void**)args;
params.extra = nullptr;

CUDA_SAFE_CALL(cudaGraphAddKernelNode(
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
&graph_node, graph, /* dependencies = */ nullptr,
/* numDependencies = */ 0, &params));
} else {
// We still need an empty node for the dependency structure
CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaGraphAddEmptyNode(&graph_node, graph,
/* dependencies = */ nullptr,
/* numDependencies = */ 0));
}
KOKKOS_ENSURES(bool(graph_node))
}
@ -538,7 +540,8 @@ struct CudaParallelLaunchKernelInvoker<
dim3 const& block, int shmem,
CudaInternal const* cuda_instance) {
// Wait until the previous kernel that uses the constant buffer is done
CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaEventSynchronize(cuda_instance->constantMemReusable));

// Copy functor (synchronously) to staging buffer in pinned host memory
unsigned long* staging = cuda_instance->constantMemHostStaging;
@ -554,8 +557,9 @@ struct CudaParallelLaunchKernelInvoker<
get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();

// Record an event that says when the constant buffer can be reused
CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
cudaStream_t(cuda_instance->m_stream)));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaEventRecord(cuda_instance->constantMemReusable,
cudaStream_t(cuda_instance->m_stream)));
}

#ifdef KOKKOS_CUDA_ENABLE_GRAPHS
@ -637,8 +641,9 @@ struct CudaParallelLaunchImpl<
base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance);

#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
CUDA_SAFE_CALL(cudaGetLastError());
cuda_instance->fence();
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
cuda_instance->fence(
"Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error");
#endif
}
}
@ -650,7 +655,7 @@ struct CudaParallelLaunchImpl<
// the code and the result is visible.
auto wrap_get_attributes = []() -> cudaFuncAttributes {
cudaFuncAttributes attr_tmp;
CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func()));
return attr_tmp;
};

@ -81,22 +81,34 @@ namespace Impl {
CudaLockArrays g_host_cuda_lock_arrays = {nullptr, nullptr, 0};

void initialize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::init_lock_arrays();

DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
#endif
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
sizeof(int) * (Cuda::concurrency())));
CUDA_SAFE_CALL(cudaDeviceSynchronize());
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaMalloc(&g_host_cuda_lock_arrays.atomic,
sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1)));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
sizeof(int) * (Cuda::concurrency())));
Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays");
g_host_cuda_lock_arrays.n = Cuda::concurrency();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256,
256>>>();
init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency() + 255) / 256,
256>>>(Kokkos::Cuda::concurrency());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays");
}

void finalize_host_cuda_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
desul::Impl::finalize_lock_arrays();
#endif

if (g_host_cuda_lock_arrays.atomic == nullptr) return;
cudaFree(g_host_cuda_lock_arrays.atomic);
g_host_cuda_lock_arrays.atomic = nullptr;

@ -53,6 +53,10 @@

#include <Cuda/Kokkos_Cuda_Error.hpp>

#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#include <desul/atomics/Lock_Array_Cuda.hpp>
#endif

namespace Kokkos {
namespace Impl {

@ -150,13 +154,14 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
} // namespace
} // namespace Impl
} // namespace Kokkos

/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
if (::Kokkos::Impl::lock_array_copied == 0) { \
CUDA_SAFE_CALL( \
KOKKOS_IMPL_CUDA_SAFE_CALL( \
cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \
&Kokkos::Impl::g_host_cuda_lock_arrays, \
sizeof(Kokkos::Impl::CudaLockArrays))); \
@ -164,6 +169,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
lock_array_copied = 1; \
}

#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS

#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
@ -171,6 +178,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif

#else

#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#endif

#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */

#endif /* defined( KOKKOS_ENABLE_CUDA ) */

#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */

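Reviewer note: as the Dan Ibanez comment in the hunk explains, the copy must stay a macro. With relocatable device code disabled, every translation unit holds its own copy of the __device__ lock-array symbol, so only a macro expanded in the consuming TU takes the address of the right copy. A hedged sketch of the pitfall, with hypothetical names:

// Sketch: why a macro rather than a helper function. Without relocatable
// device code, each TU gets a distinct instance of this __device__ symbol,
// so the cudaMemcpyToSymbol must be expanded in the TU that will read it.
__device__ int g_tu_local_state;  // hypothetical per-TU device symbol

#define COPY_STATE_TO_DEVICE(host_value) \
  cudaMemcpyToSymbol(g_tu_local_state, &(host_value), sizeof(int))
// A helper function compiled in another TU would capture that TU's copy
// of the symbol instead, leaving this TU's copy uninitialized.
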
@ -62,7 +62,6 @@
|
||||
#include <Cuda/Kokkos_Cuda_Locks.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Team.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
|
||||
|
||||
#include <impl/Kokkos_Tools.hpp>
|
||||
#include <typeinfo>
|
||||
@ -240,9 +239,11 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
|
||||
KOKKOS_DEPRECATED inline int vector_length() const {
|
||||
return impl_vector_length();
|
||||
}
|
||||
#endif
|
||||
inline int impl_vector_length() const { return m_vector_length; }
|
||||
inline int team_size() const { return m_team_size; }
|
||||
inline int league_size() const { return m_league_size; }
|
||||
@ -687,6 +688,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
int m_shmem_size;
|
||||
void* m_scratch_ptr[2];
|
||||
int m_scratch_size[2];
|
||||
int m_scratch_pool_id = -1;
|
||||
|
||||
template <class TagType>
|
||||
__device__ inline
|
||||
@ -797,15 +799,19 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
// Functor's reduce memory, team scan memory, and team shared memory depend
|
||||
// upon team size.
|
||||
m_scratch_ptr[0] = nullptr;
|
||||
m_scratch_ptr[1] =
|
||||
m_team_size <= 0
|
||||
? nullptr
|
||||
: m_policy.space()
|
||||
.impl_internal_space_instance()
|
||||
->resize_team_scratch_space(
|
||||
static_cast<ptrdiff_t>(m_scratch_size[1]) *
|
||||
static_cast<ptrdiff_t>(Cuda::concurrency() /
|
||||
(m_team_size * m_vector_size)));
|
||||
if (m_team_size <= 0) {
|
||||
m_scratch_ptr[1] = nullptr;
|
||||
} else {
|
||||
auto scratch_ptr_id =
|
||||
m_policy.space()
|
||||
.impl_internal_space_instance()
|
||||
->resize_team_scratch_space(
|
||||
static_cast<std::int64_t>(m_scratch_size[1]) *
|
||||
(static_cast<std::int64_t>(Cuda::concurrency() /
|
||||
(m_team_size * m_vector_size))));
|
||||
m_scratch_ptr[1] = scratch_ptr_id.first;
|
||||
m_scratch_pool_id = scratch_ptr_id.second;
|
||||
}
|
||||
|
||||
const int shmem_size_total = m_shmem_begin + m_shmem_size;
|
||||
if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
|
||||
@ -829,6 +835,14 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
|
||||
"Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
|
||||
}
|
||||
}
|
||||
|
||||
~ParallelFor() {
|
||||
if (m_scratch_pool_id >= 0) {
|
||||
m_policy.space()
|
||||
.impl_internal_space_instance()
|
||||
->m_team_scratch_pool[m_scratch_pool_id] = 0;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
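The change above turns team scratch allocation from a single shared buffer into a pooled acquire/release: `resize_team_scratch_space` now hands back a (pointer, pool-id) pair, and the `~ParallelFor()` destructor clears `m_team_scratch_pool[m_scratch_pool_id]` so another in-flight kernel can claim the slot. A standalone sketch of that protocol under stated assumptions; `ScratchPool` is illustrative, not the Kokkos implementation:

#include <atomic>
#include <cstdint>
#include <utility>
#include <vector>

struct ScratchPool {
  static constexpr int num_slots = 10;
  std::atomic<int> in_use[num_slots] = {};  // 0 = free, 1 = claimed
  std::vector<char> buffers[num_slots];

  // Claim a free slot and make sure its buffer is large enough;
  // returns {buffer, slot}, analogous to resize_team_scratch_space.
  std::pair<void*, int> acquire(std::int64_t bytes) {
    for (int s = 0; s < num_slots; ++s) {
      int expected = 0;
      if (in_use[s].compare_exchange_strong(expected, 1)) {
        if (buffers[s].size() < static_cast<std::size_t>(bytes))
          buffers[s].resize(bytes);  // grow-only, never shrinks
        return {buffers[s].data(), s};
      }
    }
    return {nullptr, -1};  // pool exhausted; real code would block or fall back
  }

  // What ~ParallelFor() does with m_scratch_pool_id: release the slot.
  void release(int slot) {
    if (slot >= 0) in_use[slot].store(0);
  }
};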
@ -870,9 +884,24 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
using value_type = typename ValueTraits::value_type;
using reference_type = typename ValueTraits::reference_type;
using functor_type = FunctorType;
using size_type = Kokkos::Cuda::size_type;
using index_type = typename Policy::index_type;
using reducer_type = ReducerType;
// Conditionally set word_size_type to int16_t or int8_t if value_type is
// smaller than int32_t (Kokkos::Cuda::size_type)
// word_size_type is used to determine the word count, shared memory buffer
// size, and global memory buffer size before the reduction is performed.
// Within the reduction, the word count is recomputed based on word_size_type
// and when calculating indexes into the shared/global memory buffers for
// performing the reduction, word_size_type is used again.
// For scalars > 4 bytes in size, indexing into shared/global memory relies
// on the block and grid dimensions to ensure that we index at the correct
// offset rather than at every 4 byte word; such that, when the join is
// performed, we have the correct data that was copied over in chunks of 4
// bytes.
using word_size_type = typename std::conditional<
sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
typename std::conditional<sizeof(value_type) == 2, int16_t, int8_t>::type,
Kokkos::Cuda::size_type>::type;
using index_type = typename Policy::index_type;
using reducer_type = ReducerType;

// Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
// blockDim.z == 1
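Restated standalone, the `word_size_type` selection above picks the reduction word from the value type, assuming (as in CUDA builds) that `Kokkos::Cuda::size_type` is a 4-byte unsigned integer; `size_type` below is a stand-in for that alias:

#include <cstdint>
#include <type_traits>

using size_type = unsigned int;  // stand-in for Kokkos::Cuda::size_type

template <class ValueType>
using word_size_type = typename std::conditional<
    sizeof(ValueType) < sizeof(size_type),
    typename std::conditional<sizeof(ValueType) == 2, std::int16_t,
                              std::int8_t>::type,
    size_type>::type;

// 4-byte-or-larger scalars keep the 4-byte word; smaller scalars shrink it.
static_assert(std::is_same<word_size_type<double>, size_type>::value, "");
static_assert(std::is_same<word_size_type<std::int16_t>, std::int16_t>::value, "");
static_assert(std::is_same<word_size_type<std::int8_t>, std::int8_t>::value, "");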
@ -883,9 +912,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
const pointer_type m_result_ptr;
const bool m_result_ptr_device_accessible;
const bool m_result_ptr_host_accessible;
size_type* m_scratch_space;
size_type* m_scratch_flags;
size_type* m_unified_space;
word_size_type* m_scratch_space;
// m_scratch_flags must be of type Cuda::size_type due to use of atomics
// for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
Cuda::size_type* m_scratch_flags;
word_size_type* m_unified_space;

// Shall we use the shfl based reduction or not (only use it for static sized
// types of more than 128bit)
@ -924,16 +955,16 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
__device__ inline
void run(const DummySHMEMReductionType& ) const
{*/
const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
sizeof(size_type)>
const integral_nonzero_constant<
word_size_type, ValueTraits::StaticValueSize / sizeof(word_size_type)>
word_count(ValueTraits::value_size(
ReducerConditional::select(m_functor, m_reducer)) /
sizeof(size_type));
sizeof(word_size_type));

{
reference_type value =
ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
kokkos_impl_cuda_shared_memory<size_type>() +
kokkos_impl_cuda_shared_memory<word_size_type>() +
threadIdx.y * word_count.value);

// Number of blocks is bounded so that the reduction can be limited to two
@ -958,11 +989,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
// This is the final block with the final result at the final threads'
// location

size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
(blockDim.y - 1) * word_count.value;
size_type* const global =
word_size_type* const shared =
kokkos_impl_cuda_shared_memory<word_size_type>() +
(blockDim.y - 1) * word_count.value;
word_size_type* const global =
m_result_ptr_device_accessible
? reinterpret_cast<size_type*>(m_result_ptr)
? reinterpret_cast<word_size_type*>(m_result_ptr)
: (m_unified_space ? m_unified_space : m_scratch_space);

if (threadIdx.y == 0) {
@ -985,17 +1017,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
if (cuda_single_inter_block_reduce_scan<false, ReducerTypeFwd,
WorkTagFwd>(
ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
gridDim.x, kokkos_impl_cuda_shared_memory<word_size_type>(),
m_scratch_space, m_scratch_flags)) {
// This is the final block with the final result at the final threads'
// location

size_type* const shared =
kokkos_impl_cuda_shared_memory<size_type>() +
word_size_type* const shared =
kokkos_impl_cuda_shared_memory<word_size_type>() +
(blockDim.y - 1) * word_count.value;
size_type* const global =
word_size_type* const global =
m_result_ptr_device_accessible
? reinterpret_cast<size_type*>(m_result_ptr)
? reinterpret_cast<word_size_type*>(m_result_ptr)
: (m_unified_space ? m_unified_space : m_scratch_space);

if (threadIdx.y == 0) {
@ -1100,15 +1132,21 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,

KOKKOS_ASSERT(block_size > 0);

m_scratch_space = cuda_internal_scratch_space(
// TODO: down casting these uses more space than required?
m_scratch_space = (word_size_type*)cuda_internal_scratch_space(
m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
m_functor, m_reducer)) *
block_size /* block_size == max block_count */);
m_scratch_flags =
cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type));
m_unified_space = cuda_internal_scratch_unified(
m_policy.space(), ValueTraits::value_size(ReducerConditional::select(
m_functor, m_reducer)));

// Intentionally do not downcast to word_size_type since we use Cuda
// atomics in Kokkos_Cuda_ReduceScan.hpp
m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(),
sizeof(Cuda::size_type));
m_unified_space =
reinterpret_cast<word_size_type*>(cuda_internal_scratch_unified(
m_policy.space(),
ValueTraits::value_size(
ReducerConditional::select(m_functor, m_reducer))));

// REQUIRED ( 1 , N , 1 )
dim3 block(1, block_size, 1);
@ -1139,7 +1177,9 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
false); // copy to device and execute

if (!m_result_ptr_device_accessible) {
m_policy.space().fence();
m_policy.space().fence(
"Kokkos::Impl::ParallelReduce<Cuda, RangePolicy>::execute: Result "
"Not Device Accessible");

if (m_result_ptr) {
if (m_unified_space) {
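This hunk and the ones that follow replace every unlabeled internal `fence()` with a labeled one, matching the 3.5.0 changelog item "Fence Profiling Support in all backends": the string is forwarded to Kokkos Tools callbacks so profilers can attribute the wait. User code can use the same labeled overloads; a minimal usage sketch (the label strings are arbitrary):

#include <Kokkos_Core.hpp>

void example(const Kokkos::Cuda& exec) {
  // Instance fence: waits for work submitted to this execution space
  // instance; the label is reported to profiling tools.
  exec.fence("my_app::after_reduce");
  // Global fence across all execution space instances.
  Kokkos::fence("my_app::end_of_step");
}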
@ -1459,7 +1499,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
false); // copy to device and execute

if (!m_result_ptr_device_accessible) {
m_policy.space().fence();
m_policy.space().fence(
"Kokkos::Impl::ParallelReduce<Cuda, MDRangePolicy>::execute: "
"Result Not Device Accessible");

if (m_result_ptr) {
if (m_unified_space) {
@ -1580,6 +1622,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_type m_shmem_size;
void* m_scratch_ptr[2];
int m_scratch_size[2];
int m_scratch_pool_id = -1;
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
@ -1821,7 +1864,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
true); // copy to device and execute

if (!m_result_ptr_device_accessible) {
m_policy.space().fence();
m_policy.space().fence(
"Kokkos::Impl::ParallelReduce<Cuda, TeamPolicy>::execute: Result "
"Not Device Accessible");

if (m_result_ptr) {
if (m_unified_space) {
@ -1895,16 +1940,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
m_scratch_size[0] = m_shmem_size;
m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
m_scratch_ptr[1] =
m_team_size <= 0
? nullptr
: m_policy.space()
.impl_internal_space_instance()
->resize_team_scratch_space(
static_cast<std::int64_t>(m_scratch_size[1]) *
(static_cast<std::int64_t>(
Cuda::concurrency() /
(m_team_size * m_vector_size))));
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
auto scratch_ptr_id =
m_policy.space()
.impl_internal_space_instance()
->resize_team_scratch_space(
static_cast<std::int64_t>(m_scratch_size[1]) *
(static_cast<std::int64_t>(Cuda::concurrency() /
(m_team_size * m_vector_size))));
m_scratch_ptr[1] = scratch_ptr_id.first;
m_scratch_pool_id = scratch_ptr_id.second;
}

// The global parallel_reduce does not support vector_length other than 1 at
// the moment
@ -1973,6 +2021,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
cudaFuncAttributes attr =
CudaParallelLaunch<ParallelReduce,
LaunchBounds>::get_cuda_func_attributes();

// Valid team size not provided, deduce team size
m_team_size =
m_team_size >= 0
? m_team_size
@ -1994,15 +2044,19 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
m_scratch_size[0] = m_shmem_size;
m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
m_scratch_ptr[1] =
m_team_size <= 0
? nullptr
: m_policy.space()
.impl_internal_space_instance()
->resize_team_scratch_space(
static_cast<ptrdiff_t>(m_scratch_size[1]) *
static_cast<ptrdiff_t>(Cuda::concurrency() /
(m_team_size * m_vector_size)));
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
auto scratch_ptr_id =
m_policy.space()
.impl_internal_space_instance()
->resize_team_scratch_space(
static_cast<std::int64_t>(m_scratch_size[1]) *
(static_cast<std::int64_t>(Cuda::concurrency() /
(m_team_size * m_vector_size))));
m_scratch_ptr[1] = scratch_ptr_id.first;
m_scratch_pool_id = scratch_ptr_id.second;
}

// The global parallel_reduce does not support vector_length other than 1 at
// the moment
@ -2030,13 +2084,28 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
Kokkos::Impl::throw_runtime_exception(
std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}
if (int(m_team_size) >
arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {

size_type team_size_max =
Kokkos::Impl::cuda_get_max_block_size<FunctorType, LaunchBounds>(
m_policy.space().impl_internal_space_instance(), attr, m_functor,
m_vector_size, m_policy.team_scratch_size(0),
m_policy.thread_scratch_size(0)) /
m_vector_size;

if ((int)m_team_size > (int)team_size_max) {
Kokkos::Impl::throw_runtime_exception(
std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too "
"large team size."));
}
}

~ParallelReduce() {
if (m_scratch_pool_id >= 0) {
m_policy.space()
.impl_internal_space_instance()
->m_team_scratch_pool[m_scratch_pool_id] = 0;
}
}
};

} // namespace Impl
@ -2167,9 +2236,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {

for (typename Policy::member_type iwork_base = range.begin();
iwork_base < range.end(); iwork_base += blockDim.y) {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
#endif
unsigned MASK = __activemask();
const typename Policy::member_type iwork = iwork_base + threadIdx.y;

__syncthreads(); // Don't overwrite previous iteration values until they
@ -2182,11 +2249,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>, Kokkos::Cuda> {
for (unsigned i = threadIdx.y; i < word_count.value; ++i) {
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
__syncwarp(MASK);
if (CudaTraits::WarpSize < word_count.value) {
__syncthreads();
} // Protect against large scan values.
@ -2457,9 +2520,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,

for (typename Policy::member_type iwork_base = range.begin();
iwork_base < range.end(); iwork_base += blockDim.y) {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK;
#endif
unsigned MASK = __activemask();

const typename Policy::member_type iwork = iwork_base + threadIdx.y;

@ -2474,11 +2535,7 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i];
}

#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
__syncwarp(MASK);
if (CudaTraits::WarpSize < word_count.value) {
__syncthreads();
} // Protect against large scan values.

@ -191,48 +191,28 @@ __device__ bool cuda_inter_block_reduction(
value_type tmp = Kokkos::shfl_down(value, 1, 32);
if (id + 1 < int(gridDim.x)) join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int mask = __activemask();
int active = __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2, 32);
if (id + 2 < int(gridDim.x)) join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4, 32);
if (id + 4 < int(gridDim.x)) join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8, 32);
if (id + 8 < int(gridDim.x)) join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16, 32);
if (id + 16 < int(gridDim.x)) join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
}
}
// The last block has in its thread=0 the global reduction value through
@ -388,48 +368,28 @@ __device__ inline
value_type tmp = Kokkos::shfl_down(value, 1, 32);
if (id + 1 < int(gridDim.x)) reducer.join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
unsigned int mask = __activemask();
int active = __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2, 32);
if (id + 2 < int(gridDim.x)) reducer.join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4, 32);
if (id + 4 < int(gridDim.x)) reducer.join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8, 32);
if (id + 8 < int(gridDim.x)) reducer.join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
if (int(blockDim.x * blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16, 32);
if (id + 16 < int(gridDim.x)) reducer.join(value, tmp);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#endif
active += __ballot_sync(mask, 1);
}
}
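The hunks above drop the `KOKKOS_IMPL_CUDA_ACTIVEMASK` / `KOKKOS_IMPL_CUDA_BALLOT*` compatibility macros in favor of the bare CUDA 9+ intrinsics: capture the set of converged lanes once with `__activemask()`, then vote through `__ballot_sync()` against that fixed mask at each step. A minimal sketch of the pattern (illustrative function name, not library code):

// Capture the active-lane mask once, then reuse it for every vote.
__device__ unsigned vote_pattern() {
  unsigned mask = __activemask();           // lanes converged at this point
  unsigned votes = __ballot_sync(mask, 1);  // bit i set iff lane i voted true
  votes += __ballot_sync(mask, 1);          // accumulated step by step, as above
  return votes;
}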

@ -573,23 +533,17 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
// part of the reduction
const int width) // How much of the warp participates
{
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned mask =
width == 32
? 0xffffffff
: ((1 << width) - 1)
<< ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width;
#endif
const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32;
for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) {
if (lane_id + delta < 32) {
ValueJoin::join(functor, value, value + delta);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
__syncwarp(mask);
}
*value = *(value - lane_id);
}
@ -612,17 +566,18 @@ struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
const unsigned int delta = (threadIdx.y * blockDim.x + threadIdx.x) * 32;
if (delta < blockDim.x * blockDim.y)
*my_shared_team_buffer_element = shared_team_buffer_element[delta];
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, false,
blockDim.x * blockDim.y / 32);
if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
}
}

template <class SizeType = Cuda::size_type>
__device__ static inline bool scalar_inter_block_reduction(
const FunctorType& functor, const Cuda::size_type /*block_id*/,
const Cuda::size_type block_count, Cuda::size_type* const shared_data,
Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
const Cuda::size_type block_count, SizeType* const shared_data,
SizeType* const global_data, Cuda::size_type* const global_flags) {
Scalar* const global_team_buffer_element = ((Scalar*)global_data);
Scalar* const my_global_team_buffer_element =
global_team_buffer_element + blockIdx.x;
@ -713,17 +668,17 @@ __device__ void cuda_intra_block_reduce_scan(
const pointer_type tdata_intra = base_data + value_count * threadIdx.y;

{ // Intra-warp reduction:
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0)
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1)
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2)
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3)
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4)
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
}

__syncthreads(); // Wait for all warps to reduce
@ -732,57 +687,31 @@ __device__ void cuda_intra_block_reduce_scan(
const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask)
<< CudaTraits::WarpIndexShift;

#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned inner_mask =
KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff, (rtid_inter < blockDim.y));
#endif
unsigned inner_mask = __ballot_sync(0xffffffff, (rtid_inter < blockDim.y));
if (rtid_inter < blockDim.y) {
const pointer_type tdata_inter =
base_data + value_count * (rtid_inter ^ BlockSizeMask);

#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
if ((1 << 5) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
}
if ((1 << 6) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
}
if ((1 << 7) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
}
if ((1 << 8) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
}
if ((1 << 9) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
}
#else
if ((1 << 5) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5)
}
if ((1 << 6) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6)
}
if ((1 << 7) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7)
}
if ((1 << 8) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8)
}
if ((1 << 9) < BlockSizeMask) {
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9)
}
#endif

if (DoScan) {
int n =
@ -795,25 +724,14 @@ __device__ void cuda_intra_block_reduce_scan(

if (!(rtid_inter + n < blockDim.y)) n = 0;

#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_SCAN_STEP(tdata_inter, n, 8)
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_SCAN_STEP(tdata_inter, n, 7)
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_SCAN_STEP(tdata_inter, n, 6)
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask);
__syncwarp(inner_mask);
BLOCK_SCAN_STEP(tdata_inter, n, 5)
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_inter, n, 8)
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_inter, n, 7)
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_inter, n, 6)
KOKKOS_IMPL_CUDA_SYNCWARP;
BLOCK_SCAN_STEP(tdata_inter, n, 5)
#endif
}
}
}
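The `BLOCK_SCAN_STEP` ladder above walks the same log-step structure as a warp-level prefix sum, just through shared memory and under an explicit sync mask. For orientation, the shuffle-based equivalent of one warp's inclusive scan (a standalone sketch, not the Kokkos code):

// 32-lane inclusive prefix sum via __shfl_up_sync, same log-step shape
// as the BLOCK_SCAN_STEP sequence above.
__device__ int warp_inclusive_scan(int value) {
  const int lane = threadIdx.x & 31;
  for (int delta = 1; delta < 32; delta <<= 1) {
    int up = __shfl_up_sync(0xffffffff, value, delta);
    if (lane >= delta) value += up;  // lanes below delta keep their value
  }
  return value;
}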
@ -832,17 +750,17 @@ __device__ void cuda_intra_block_reduce_scan(
: ((rtid_intra & 16) ? 16 : 0))));

if (!(rtid_intra + n < blockDim.y)) n = 0;
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block();
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
}

#undef BLOCK_SCAN_STEP
@ -858,12 +776,13 @@ __device__ void cuda_intra_block_reduce_scan(
* Global reduce result is in the last threads' 'shared_data' location.
*/

template <bool DoScan, class FunctorType, class ArgTag>
template <bool DoScan, class FunctorType, class ArgTag,
class SizeType = Cuda::size_type>
__device__ bool cuda_single_inter_block_reduce_scan2(
const FunctorType& functor, const Cuda::size_type block_id,
const Cuda::size_type block_count, Cuda::size_type* const shared_data,
Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
using size_type = Cuda::size_type;
const Cuda::size_type block_count, SizeType* const shared_data,
SizeType* const global_data, Cuda::size_type* const global_flags) {
using size_type = SizeType;
using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
using ValueJoin = FunctorValueJoin<FunctorType, ArgTag>;
using ValueInit = FunctorValueInit<FunctorType, ArgTag>;
@ -953,11 +872,12 @@ __device__ bool cuda_single_inter_block_reduce_scan2(
return is_last_block;
}

template <bool DoScan, class FunctorType, class ArgTag>
template <bool DoScan, class FunctorType, class ArgTag,
class SizeType = Cuda::size_type>
__device__ bool cuda_single_inter_block_reduce_scan(
const FunctorType& functor, const Cuda::size_type block_id,
const Cuda::size_type block_count, Cuda::size_type* const shared_data,
Cuda::size_type* const global_data, Cuda::size_type* const global_flags) {
const Cuda::size_type block_count, SizeType* const shared_data,
SizeType* const global_data, Cuda::size_type* const global_flags) {
using ValueTraits = FunctorValueTraits<FunctorType, ArgTag>;
if (!DoScan && ValueTraits::StaticValueSize > 0)
return Kokkos::Impl::CudaReductionsFunctor<

@ -54,11 +54,27 @@
#include <Kokkos_Core_fwd.hpp>

#include <impl/Kokkos_TaskBase.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp> // CUDA_SAFE_CALL
#include <Cuda/Kokkos_Cuda_Error.hpp> // KOKKOS_IMPL_CUDA_SAFE_CALL
#include <impl/Kokkos_TaskTeamMember.hpp>

//----------------------------------------------------------------------------

#if defined(__CUDA_ARCH__)
#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \
{ \
__syncwarp(); \
const unsigned b = __activemask(); \
if (b != 0xffffffff) { \
printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG, \
blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
threadIdx.z, b); \
return; \
} \
}
#else
#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
#endif
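This macro moves here from the deleted compatibility header: after a `__syncwarp()` it checks that the whole warp actually reconverged and, if not, prints a diagnostic and bails out of the enclosing (void) device function. Intended use, as at the `KOKKOS_ENABLE_DEBUG` call site later in this file (`broadcast_step` is a hypothetical caller):

__device__ void broadcast_step() {
  // ... warp-divergent work ...
  KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("broadcast_step");
  // past this point every lane of the warp is known to be active
}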

namespace Kokkos {
namespace Impl {
namespace {
@ -138,13 +154,13 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
// Broadcast task pointer:

// Sync before the broadcast
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

// pretend it's an int* for shuffle purposes
((int*)&current_task)[0] =
KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[0], 0, 32);
__shfl_sync(0xffffffff, ((int*)&current_task)[0], 0, 32);
((int*)&current_task)[1] =
KOKKOS_IMPL_CUDA_SHFL(((int*)&current_task)[1], 0, 32);
__shfl_sync(0xffffffff, ((int*)&current_task)[1], 0, 32);

if (current_task) {
KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag());
@ -168,7 +184,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {

// Synchronize threads of the warp and insure memory
// writes are visible to all threads in the warp.
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

if (shared_memory_task_copy->is_team_runnable()) {
// Thread Team Task
@ -182,7 +198,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
// Synchronize threads of the warp and insure memory
// writes are visible to all threads in the warp.

KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

// if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize;
// b -= b % CudaTraits::WarpSize;
@ -196,7 +212,7 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
// writes are visible to root thread of the warp for
// respawn or completion.

KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

if (warp_lane == 0) {
// If respawn requested copy respawn data back to main memory
@ -249,12 +265,14 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {

auto& queue = scheduler.queue();

CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
"Cuda>::execute: Pre Task Execution");

// Query the stack size, in bytes:

size_t previous_stack_size = 0;
CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

// If not large enough then set the stack size, in bytes:
@ -262,18 +280,21 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
const size_t larger_stack_size = 1 << 11;

if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
}

cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
scheduler, shared_per_warp);

CUDA_SAFE_CALL(cudaGetLastError());
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());

CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
"Cuda>::execute: Post Task Execution");

if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
}
}
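Stripped of the Kokkos error-check wrappers, `execute()` above follows a plain save/bump/restore pattern on the CUDA device stack limit so recursive task kernels have enough stack. The same pattern in bare CUDA runtime calls (error handling elided; `with_larger_stack` is an illustrative name):

#include <cuda_runtime.h>

void with_larger_stack(void (*launch)()) {
  size_t previous = 0;
  cudaDeviceGetLimit(&previous, cudaLimitStackSize);
  const size_t larger = 1 << 11;  // 2048 bytes, as in the diff
  if (previous < larger) cudaDeviceSetLimit(cudaLimitStackSize, larger);
  launch();                        // kernel(s) that need the deeper stack
  cudaDeviceSynchronize();
  if (previous < larger) cudaDeviceSetLimit(cudaLimitStackSize, previous);
}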
@ -295,13 +316,17 @@ class TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::Cuda, QueueType>> {
destroy_type* dtor_ptr =
(destroy_type*)((char*)storage + sizeof(function_type));

CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
"Cuda>::execute: Pre Get Function Pointer for Tasks");

set_cuda_task_base_apply_function_pointer<TaskType>
<<<1, 1>>>(ptr_ptr, dtor_ptr);

CUDA_SAFE_CALL(cudaGetLastError());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecialization<SimpleTaskScheduler<Kokkos::"
"Cuda>::execute: Post Get Function Pointer for Tasks");

ptr = *ptr_ptr;
dtor = *dtor_ptr;
@ -372,23 +397,20 @@ class TaskQueueSpecializationConstrained<
// count of 0 also. Otherwise, returns a task from another queue
// or `end` if one couldn't be popped
task_ptr = team_queue.attempt_to_steal_task();
#if 0
if(task != no_more_tasks_sentinel && task != end) {
std::printf("task stolen on rank %d\n", team_exec.league_rank());
}
#endif
}
}

// Synchronize warp with memory fence before broadcasting task pointer:

// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" );
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

// Broadcast task pointer:

((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32);
((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32);
((int*)&task_ptr)[0] =
__shfl_sync(0xffffffff, ((int*)&task_ptr)[0], 0, 32);
((int*)&task_ptr)[1] =
__shfl_sync(0xffffffff, ((int*)&task_ptr)[1], 0, 32);

#if defined(KOKKOS_ENABLE_DEBUG)
KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr");
@ -418,7 +440,7 @@ class TaskQueueSpecializationConstrained<
// writes are visible to all threads in the warp.

// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" );
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

if (task_root_type::TaskTeam == task_shmem->m_task_type) {
// Thread Team Task
@ -432,7 +454,7 @@ class TaskQueueSpecializationConstrained<
// writes are visible to all threads in the warp.

// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" );
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

// copy task closure from shared to global memory:

@ -445,7 +467,7 @@ class TaskQueueSpecializationConstrained<
// respawn or completion.

// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" );
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);

// If respawn requested copy respawn data back to main memory

@ -475,12 +497,14 @@ class TaskQueueSpecializationConstrained<
auto& queue = scheduler.queue();
queue.initialize_team_queues(warps_per_block * grid.x);

CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
"Kokkos::Cuda>::execute: Pre Execute Task");

// Query the stack size, in bytes:

size_t previous_stack_size = 0;
CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize));

// If not large enough then set the stack size, in bytes:
@ -488,18 +512,21 @@ class TaskQueueSpecializationConstrained<
const size_t larger_stack_size = 2048;

if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size));
}

cuda_task_queue_execute<<<grid, block, shared_total, stream>>>(
scheduler, shared_per_warp);

CUDA_SAFE_CALL(cudaGetLastError());
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());

CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
"Kokkos::Cuda>::execute: Post Execute Task");

if (previous_stack_size < larger_stack_size) {
CUDA_SAFE_CALL(
KOKKOS_IMPL_CUDA_SAFE_CALL(
cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size));
}
}
@ -516,13 +543,17 @@ class TaskQueueSpecializationConstrained<
destroy_type* dtor_ptr =
(destroy_type*)((char*)storage + sizeof(function_type));

CUDA_SAFE_CALL(cudaDeviceSynchronize());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
"Kokkos::Cuda>::get_function_pointer: Pre Get Function Pointer");

set_cuda_task_base_apply_function_pointer<TaskType>
<<<1, 1>>>(ptr_ptr, dtor_ptr);

CUDA_SAFE_CALL(cudaGetLastError());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError());
Impl::cuda_device_synchronize(
"Kokkos::Impl::TaskQueueSpecializationConstrained<SimpleTaskScheduler<"
"Kokkos::Cuda>::get_function_pointer: Post Get Function Pointer");

ptr = *ptr_ptr;
dtor = *dtor_ptr;
@ -609,7 +640,7 @@ class TaskExec<Kokkos::Cuda, Scheduler> {

__device__ void team_barrier() const {
if (1 < m_team_size) {
KOKKOS_IMPL_CUDA_SYNCWARP;
__syncwarp(0xffffffff);
}
}

@ -1205,5 +1236,7 @@ KOKKOS_INLINE_FUNCTION void single(
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN

#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */

@ -340,191 +340,6 @@ class CudaTeamMember {
#endif
}

//--------------------------------------------------------------------------
/**\brief Global reduction across all blocks
*
* Return !0 if reducer contains the final value
*/
template <typename ReducerType>
KOKKOS_INLINE_FUNCTION static
typename std::enable_if<is_reducer<ReducerType>::value, int>::type
global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
void* const global_scratch_space, void* const shmem,
int const shmem_size) {
#ifdef __CUDA_ARCH__

using value_type = typename ReducerType::value_type;
using pointer_type = value_type volatile*;

// Number of shared memory entries for the reduction:
const int nsh = shmem_size / sizeof(value_type);

// Number of CUDA threads in the block, rank within the block
const int nid = blockDim.x * blockDim.y * blockDim.z;
const int tid =
threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

// Reduces within block using all available shared memory
// Contributes if it is the root "vector lane"

// wn == number of warps in the block
// wx == which lane within the warp
// wy == which warp within the block

const int wn =
(nid + CudaTraits::WarpIndexMask) >> CudaTraits::WarpIndexShift;
const int wx = tid & CudaTraits::WarpIndexMask;
const int wy = tid >> CudaTraits::WarpIndexShift;

//------------------------
{ // Intra warp shuffle reduction from contributing CUDA threads

value_type tmp(reducer.reference());

for (int i = CudaTraits::WarpSize; (int)blockDim.x <= (i >>= 1);) {
Impl::in_place_shfl_down(reducer.reference(), tmp, i,
CudaTraits::WarpSize);

// Root of each vector lane reduces "thread" contribution
if (0 == threadIdx.x && wx < i) {
reducer.join(&tmp, reducer.data());
}
}

// Reduce across warps using shared memory.
// Number of warps may not be power of two.

__syncthreads(); // Wait before shared data write

// Number of shared memory entries for the reduction
// is at most one per warp
const int nentry = wn < nsh ? wn : nsh;

if (0 == wx && wy < nentry) {
// Root thread of warp 'wy' has warp's value to contribute
((value_type*)shmem)[wy] = tmp;
}

__syncthreads(); // Wait for write to be visible to block

// When more warps than shared entries
// then warps must take turns joining their contribution
// to the designated shared memory entry.
for (int i = nentry; i < wn; i += nentry) {
const int k = wy - i;

if (0 == wx && i <= wy && k < nentry) {
// Root thread of warp 'wy' has warp's value to contribute
reducer.join(((value_type*)shmem) + k, &tmp);
}

__syncthreads(); // Wait for write to be visible to block
}

// One warp performs the inter-warp reduction:

if (0 == wy) {
// Start fan-in at power of two covering nentry

for (int i = (1 << (32 - __clz(nentry - 1))); (i >>= 1);) {
const int k = wx + i;
if (wx < i && k < nentry) {
reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
__threadfence_block(); // Wait for write to be visible to warp
}
}
}
}
//------------------------
{ // Write block's value to global_scratch_memory

int last_block = 0;

if (0 == wx) {
reducer.copy(((pointer_type)global_scratch_space) +
blockIdx.x * reducer.length(),
reducer.data());

__threadfence(); // Wait until global write is visible.

last_block = (int)gridDim.x ==
1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);

// If last block then reset count
if (last_block) *global_scratch_flags = 0;
}

last_block = __syncthreads_or(last_block);

if (!last_block) return 0;
}
//------------------------
// Last block reads global_scratch_memory into shared memory.

const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
: (gridDim.x < nsh ? gridDim.x : nsh);

// nentry = min( nid , nsh , gridDim.x )

// whole block reads global memory into shared memory:

if (tid < nentry) {
const int offset = tid * reducer.length();

reducer.copy(((pointer_type)shmem) + offset,
((pointer_type)global_scratch_space) + offset);

for (int i = nentry + tid; i < (int)gridDim.x; i += nentry) {
reducer.join(
((pointer_type)shmem) + offset,
((pointer_type)global_scratch_space) + i * reducer.length());
}
}

__syncthreads(); // Wait for writes to be visible to block

if (0 == wy) {
// Iterate to reduce shared memory to single warp fan-in size

const int nreduce =
CudaTraits::WarpSize < nentry ? CudaTraits::WarpSize : nentry;

// nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )

if (wx < nreduce && nreduce < nentry) {
for (int i = nreduce + wx; i < nentry; i += nreduce) {
reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
}
__threadfence_block(); // Wait for writes to be visible to warp
}

// Start fan-in at power of two covering nentry

for (int i = (1 << (32 - __clz(nreduce - 1))); (i >>= 1);) {
const int k = wx + i;
if (wx < i && k < nreduce) {
reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k);
__threadfence_block(); // Wait for writes to be visible to warp
}
}

if (0 == wx) {
reducer.copy(reducer.data(), (pointer_type)shmem);
return 1;
}
}
return 0;

#else
(void)reducer;
(void)global_scratch_flags;
(void)global_scratch_space;
(void)shmem;
(void)shmem_size;
return 0;
#endif
}
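The `global_reduce` body removed above hinges on a classic grid-wide termination idiom: each block publishes its partial result, atomically bumps a counter, and the block that draws ticket `gridDim.x - 1` knows it is last and may finish the reduction. The same idiom in bare CUDA, assuming a 1-D block (illustrative names, not library code):

__device__ unsigned int g_done_count = 0;

__device__ bool last_block_done() {
  __threadfence();  // make this block's partial result globally visible first
  int last = 0;
  if (threadIdx.x == 0) {
    unsigned int ticket = atomicAdd(&g_done_count, 1u);
    last = (ticket == gridDim.x - 1);
    if (last) g_done_count = 0;  // reset for the next launch
  }
  return __syncthreads_or(last);  // broadcast the decision to the whole block
}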

//----------------------------------------
// Private for the driver

@ -533,7 +348,7 @@ class CudaTeamMember {
void* scratch_level_1_ptr, const int scratch_level_1_size,
const int arg_league_rank, const int arg_league_size)
: m_team_reduce(shared),
m_team_shared(((char*)shared) + shared_begin, shared_size,
m_team_shared(static_cast<char*>(shared) + shared_begin, shared_size,
scratch_level_1_ptr, scratch_level_1_size),
m_team_reduce_size(shared_begin),
m_league_rank(arg_league_rank),
@ -854,14 +669,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for(
i += blockDim.x) {
closure(i);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
blockDim.x == 32 ? 0xffffffff
: ((1 << blockDim.x) - 1)
<< (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
__syncwarp(blockDim.x == 32
? 0xffffffff
: ((1 << blockDim.x) - 1)
<< (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#endif
}

@ -1100,14 +911,10 @@ KOKKOS_INLINE_FUNCTION void single(
(void)lambda;
#ifdef __CUDA_ARCH__
if (threadIdx.x == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
blockDim.x == 32 ? 0xffffffff
: ((1 << blockDim.x) - 1)
<< (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
__syncwarp(blockDim.x == 32
? 0xffffffff
: ((1 << blockDim.x) - 1)
<< (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#endif
}

@ -1118,14 +925,10 @@ KOKKOS_INLINE_FUNCTION void single(
(void)lambda;
#ifdef __CUDA_ARCH__
if (threadIdx.x == 0 && threadIdx.y == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(
blockDim.x == 32 ? 0xffffffff
: ((1 << blockDim.x) - 1)
<< (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP;
#endif
__syncwarp(blockDim.x == 32
? 0xffffffff
: ((1 << blockDim.x) - 1)
<< (threadIdx.y % (32 / blockDim.x)) * blockDim.x);
#endif
}


@ -48,7 +48,12 @@
#ifdef KOKKOS_ENABLE_CUDA

#include <type_traits>
#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>

#if !defined(KOKKOS_COMPILER_CLANG)
#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
#else
#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
#endif

namespace Kokkos {

@ -61,7 +66,7 @@ constexpr unsigned shfl_all_mask = 0xffffffffu;
// Shuffle operations require input to be a register (stack) variable

// Derived implements do_shfl_op(unsigned mask, T& in, int lane, int width),
// which turns in to one of KOKKOS_IMPL_CUDA_SHFL(_UP_|_DOWN_|_)MASK
// which turns in to one of __shfl_sync(_up|_down)
// Since the logic with respect to value sizes, etc., is the same everywhere,
// put it all in one place.
template <class Derived>
@ -157,7 +162,7 @@ struct in_place_shfl_fn : in_place_shfl_op<in_place_shfl_fn> {
(void)val;
(void)lane;
(void)width;
return KOKKOS_IMPL_CUDA_SHFL_MASK(mask, val, lane, width);
return __shfl_sync(mask, val, lane, width);
}
};
template <class... Args>
@ -170,7 +175,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op<in_place_shfl_up_fn> {
__device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val,
int lane, int width) const
noexcept {
return KOKKOS_IMPL_CUDA_SHFL_UP_MASK(mask, val, lane, width);
return __shfl_up_sync(mask, val, lane, width);
}
};
template <class... Args>
@ -188,7 +193,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op<in_place_shfl_down_fn> {
(void)val;
(void)lane;
(void)width;
return KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(mask, val, lane, width);
return __shfl_down_sync(mask, val, lane, width);
}
};
template <class... Args>
@ -228,5 +233,7 @@ __device__ inline T shfl_up(const T& val, int delta, int width,

} // end namespace Kokkos

#undef KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF

#endif // defined( KOKKOS_ENABLE_CUDA )
#endif // !defined( KOKKOS_CUDA_VECTORIZATION_HPP )
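With the compatibility macros gone, the `in_place_shfl_*` wrappers above lower `Kokkos::shfl`, `shfl_up`, and `shfl_down` directly onto the `*_sync` intrinsics; user-facing calls are unchanged. A small device-side usage sketch (assuming a CUDA build, where these wrappers are `__device__`-only):

#include <Kokkos_Core.hpp>

// 32-lane sum by repeated shfl_down; lane 0 ends up with the full sum.
__device__ inline double warp_sum(double v) {
  for (int delta = 16; delta > 0; delta >>= 1)
    v += Kokkos::shfl_down(v, delta, 32);
  return v;
}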
|
||||
|
||||
@ -1,49 +0,0 @@
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(), x)
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) __ballot_sync(m, x)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) __shfl_sync(0xffffffff, x, y, z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) __shfl_sync(m, x, y, z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) __shfl_up_sync(0xffffffff, x, y, z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m, x, y, z) __shfl_up_sync(m, x, y, z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) \
|
||||
__shfl_down_sync(0xffffffff, x, y, z)
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) __shfl_down_sync(m, x, y, z)
|
||||
#else
|
||||
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
|
||||
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) 0
|
||||
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) 0
|
||||
#endif
|
||||
|
||||
#if !defined(KOKKOS_COMPILER_CLANG)
|
||||
#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long)
|
||||
#else
|
||||
#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int)
|
||||
#endif
|
||||
|
||||
#if defined(__CUDA_ARCH__)
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \
|
||||
{ \
|
||||
__syncwarp(); \
|
||||
const unsigned b = __activemask(); \
|
||||
if (b != 0xffffffff) { \
|
||||
printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG, \
|
||||
blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \
|
||||
threadIdx.z, b); \
|
||||
return; \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG)
|
||||
#endif
|
||||
@ -45,6 +45,7 @@
|
||||
#ifndef KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
|
||||
#define KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP
|
||||
|
||||
#include <functional>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if defined(__HIPCC__)
|
||||
@ -56,118 +57,239 @@ namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template <typename DriverType, bool, int MaxThreadsPerBlock, int MinBlocksPerSM>
|
||||
void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
|
||||
// FIXME_HIP - currently the "constant" path is unimplemented.
|
||||
// we should look at whether it's functional, and
|
||||
// perform some simple scaling studies to see when /
|
||||
// if the constant launcher outperforms the current
|
||||
// pass by pointer shared launcher
|
||||
HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
numBlocks,
|
||||
hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
|
||||
MinBlocksPerSM>,
|
||||
blockSize, sharedmem));
|
||||
}
|
||||
enum class BlockType { Max, Preferred };
|
||||
|
||||
template <typename DriverType, bool constant>
|
||||
void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) {
|
||||
hipOccupancy<DriverType, constant, HIPTraits::MaxThreadsPerBlock, 1>(
|
||||
numBlocks, blockSize, sharedmem);
|
||||
}
|
||||
|
||||
template <class FunctorType, class LaunchBounds, typename F>
|
||||
int hip_internal_get_block_size(const F &condition_check,
|
||||
const HIPInternal *hip_instance,
|
||||
const hipFuncAttributes &attr,
|
||||
const FunctorType &f,
|
||||
const size_t vector_length,
|
||||
const size_t shmem_block,
|
||||
const size_t shmem_thread) {
|
||||
const int min_blocks_per_sm =
|
||||
LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM;
|
||||
const int max_threads_per_block = LaunchBounds::maxTperB == 0
|
||||
? HIPTraits::MaxThreadsPerBlock
|
||||
: LaunchBounds::maxTperB;
|
||||
|
||||
const int regs_per_wavefront = std::max(attr.numRegs, 1);
|
||||
const int regs_per_sm = hip_instance->m_regsPerSM;
|
||||
const int shmem_per_sm = hip_instance->m_shmemPerSM;
|
||||
const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock;
|
||||
const int max_blocks_per_sm = hip_instance->m_maxBlocksPerSM;
|
||||
const int max_threads_per_sm = hip_instance->m_maxThreadsPerSM;
|
||||
|
||||
int block_size = max_threads_per_block;
|
||||
KOKKOS_ASSERT(block_size > 0);
|
||||
const int blocks_per_warp =
|
||||
(block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize;
|
||||
|
||||
int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
|
||||
f, block_size / vector_length);
|
||||
int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem + attr.sharedSizeBytes;
|
||||
int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
|
||||
int max_blocks_shmem =
|
||||
(total_shmem < max_shmem_per_block)
|
||||
? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
|
||||
: 0;
|
||||
int blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem);
|
||||
int threads_per_sm = blocks_per_sm * block_size;
|
||||
if (threads_per_sm > max_threads_per_sm) {
|
||||
blocks_per_sm = max_threads_per_sm / block_size;
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
}
|
||||
int opt_block_size =
|
||||
(blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm;
|
||||
int opt_threads_per_sm = threads_per_sm;
|
||||
block_size -= HIPTraits::WarpSize;
|
||||
while (condition_check(blocks_per_sm) &&
|
||||
(block_size >= HIPTraits::WarpSize)) {
|
||||
functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
|
||||
f, block_size / vector_length);
|
||||
total_shmem = shmem_block + shmem_thread * (block_size / vector_length) +
|
||||
functor_shmem + attr.sharedSizeBytes;
|
||||
max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp);
|
||||
max_blocks_shmem =
|
||||
(total_shmem < max_shmem_per_block)
|
||||
? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs)
|
||||
: 0;
|
||||
blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem);
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
if (threads_per_sm > max_threads_per_sm) {
|
||||
blocks_per_sm = max_threads_per_sm / block_size;
|
||||
threads_per_sm = blocks_per_sm * block_size;
|
||||
template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
          HIPLaunchMechanism LaunchMechanism =
              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
unsigned get_preferred_blocksize_impl() {
  // FIXME_HIP - could be if constexpr for c++17
  if (!HIPParallelLaunch<DriverType, LaunchBounds,
                         LaunchMechanism>::default_launchbounds()) {
    // use the user specified value
    return LaunchBounds::maxTperB;
  } else {
    if (HIPParallelLaunch<DriverType, LaunchBounds,
                          LaunchMechanism>::get_scratch_size() > 0) {
      return HIPTraits::ConservativeThreadsPerBlock;
    }
      if ((blocks_per_sm >= min_blocks_per_sm) &&
          (blocks_per_sm <= max_blocks_per_sm)) {
        if (threads_per_sm >= opt_threads_per_sm) {
          opt_block_size = block_size;
          opt_threads_per_sm = threads_per_sm;
    return HIPTraits::MaxThreadsPerBlock;
        }
      }

// FIXME_HIP - entire function could be constexpr for c++17
template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
          HIPLaunchMechanism LaunchMechanism =
              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
unsigned get_max_blocksize_impl() {
  // FIXME_HIP - could be if constexpr for c++17
  if (!HIPParallelLaunch<DriverType, LaunchBounds,
                         LaunchMechanism>::default_launchbounds()) {
    // use the user specified value
    return LaunchBounds::maxTperB;
  } else {
    // we can always fit 1024-thread blocks if we only care about registers
    // ... and don't mind spilling
    return HIPTraits::MaxThreadsPerBlock;
  }
}

// convenience method to select and return the proper function attributes
// for a kernel, given the launch bounds et al.
template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
          BlockType BlockSize = BlockType::Max,
          HIPLaunchMechanism LaunchMechanism =
              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
hipFuncAttributes get_hip_func_attributes_impl() {
  // FIXME_HIP - could be if constexpr for c++17
  if (!HIPParallelLaunch<DriverType, LaunchBounds,
                         LaunchMechanism>::default_launchbounds()) {
    // for user defined, we *always* honor the request
    return HIPParallelLaunch<DriverType, LaunchBounds,
                             LaunchMechanism>::get_hip_func_attributes();
  } else {
    // FIXME_HIP - could be if constexpr for c++17
    if (BlockSize == BlockType::Max) {
      return HIPParallelLaunch<
          DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
          LaunchMechanism>::get_hip_func_attributes();
    } else {
      const int blocksize =
          get_preferred_blocksize_impl<DriverType, LaunchBounds,
                                       LaunchMechanism>();
      if (blocksize == HIPTraits::MaxThreadsPerBlock) {
        return HIPParallelLaunch<
            DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
            LaunchMechanism>::get_hip_func_attributes();
      } else {
        return HIPParallelLaunch<
            DriverType,
            Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
            LaunchMechanism>::get_hip_func_attributes();
      }
    }
    block_size -= HIPTraits::WarpSize;
  }
  return opt_block_size;
}

template <class FunctorType, class LaunchBounds>
int hip_get_max_block_size(const HIPInternal *hip_instance,
                           const hipFuncAttributes &attr, const FunctorType &f,
                           const size_t vector_length, const size_t shmem_block,
                           const size_t shmem_thread) {
  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
      [](int x) { return x == 0; }, hip_instance, attr, f, vector_length,
      shmem_block, shmem_thread);
// Given an initial block-size limitation based on register usage
// determine the block size to select based on LDS limitation
template <BlockType BlockSize, class DriverType, class LaunchBounds,
          typename ShmemFunctor>
unsigned hip_internal_get_block_size(const HIPInternal *hip_instance,
                                     const ShmemFunctor &f,
                                     const unsigned tperb_reg) {
  // translate LB from CUDA to HIP
  const unsigned min_waves_per_eu =
      LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1;
  const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize;
  const unsigned shmem_per_sm = hip_instance->m_shmemPerSM;
  unsigned block_size = tperb_reg;
  do {
    unsigned total_shmem = f(block_size);
    // find how many threads we can fit with this blocksize based on LDS usage
    unsigned tperb_shmem = total_shmem > shmem_per_sm ? 0 : block_size;

    // FIXME_HIP - could be if constexpr for c++17
    if (BlockSize == BlockType::Max) {
      // we want the maximum blocksize possible
      // just wait until we get a case where we can fit the LDS per SM
      if (tperb_shmem) return block_size;
    } else {
      if (block_size == tperb_reg && tperb_shmem >= tperb_reg) {
        // fast path for exit on first iteration if registers are more limiting
        // than LDS usage, just use the register limited size
        return tperb_reg;
      }
      // otherwise we need to apply a heuristic to choose the blocksize
      // the current launchbound selection scheme is:
      //   1. If no spills, choose 1024 [MaxThreadsPerBlock]
      //   2. Otherwise, choose 256 [ConservativeThreadsPerBlock]
      //
      // For blocksizes between 256 and 1024, we'll be forced to use the 1024 LB
      // and we'll already have pretty decent occupancy, thus dropping to 256
      // *probably* isn't a concern
      const unsigned blocks_per_cu_shmem = shmem_per_sm / total_shmem;
      const unsigned tperb = tperb_shmem < tperb_reg ? tperb_shmem : tperb_reg;

      // for anything with > 4 WF's that can fit multiple blocks
      // we're probably not occupancy limited, so just return that
      if (blocks_per_cu_shmem > 1 &&
          tperb > HIPTraits::ConservativeThreadsPerBlock) {
        return block_size;
      }

      // otherwise, it's probably better to drop to the first valid size that
      // fits in the ConservativeThreadsPerBlock
      if (tperb >= min_threads_per_sm) return block_size;
    }
    block_size >>= 1;
  } while (block_size >= HIPTraits::WarpSize);
  // TODO: return a negative, add an error to kernel launch
  return 0;
}

template <typename FunctorType, typename LaunchBounds>
int hip_get_opt_block_size(HIPInternal const *hip_instance,
                           hipFuncAttributes const &attr, FunctorType const &f,
                           size_t const vector_length, size_t const shmem_block,
                           size_t const shmem_thread) {
  return hip_internal_get_block_size<FunctorType, LaunchBounds>(
      [](int) { return true; }, hip_instance, attr, f, vector_length,
      shmem_block, shmem_thread);
// Standardized blocksize deduction for parallel constructs with no LDS usage
// Returns the preferred blocksize as dictated by register usage
//
// Note: a returned block_size of zero indicates that the algorithm could not
// find a valid block size. The caller is responsible for error handling.
template <typename DriverType, typename LaunchBounds>
unsigned hip_get_preferred_blocksize() {
  return get_preferred_blocksize_impl<DriverType, LaunchBounds>();
}

// Standardized blocksize deduction for parallel constructs with no LDS usage
// Returns the max blocksize as dictated by register usage
//
// Note: a returned block_size of zero indicates that the algorithm could not
// find a valid block size. The caller is responsible for error handling.
template <typename DriverType, typename LaunchBounds>
unsigned hip_get_max_blocksize() {
  return get_max_blocksize_impl<DriverType, LaunchBounds>();
}

// Standardized blocksize deduction for non-teams parallel constructs with LDS
// usage. Returns the 'preferred' blocksize, as determined by the heuristics in
// hip_internal_get_block_size
//
// The ShmemFunctor takes a single argument of the current blocksize under
// consideration, and returns the LDS usage
//
// Note: a returned block_size of zero indicates that the algorithm could not
// find a valid block size. The caller is responsible for error handling.
template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
unsigned hip_get_preferred_blocksize(HIPInternal const *hip_instance,
                                     ShmemFunctor const &f) {
  // get preferred blocksize limited by register usage
  const unsigned tperb_reg =
      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
                                     LaunchBounds>(hip_instance, f, tperb_reg);
}
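// [Editor's sketch, not part of this patch] A caller that knows its per-block
// LDS footprint can feed the deduction a lambda. `MyDriver` and `MyBounds`
// below are hypothetical placeholders for a real driver type and launch
// bounds; the per-thread/per-block byte counts are made up for illustration.
//
//   const unsigned preferred =
//       hip_get_preferred_blocksize<MyDriver, MyBounds>(
//           hip_instance, [](unsigned block_size) {
//             // total LDS bytes at this blocksize: 1 KiB fixed + 16 B/thread
//             return 1024u + 16u * block_size;
//           });
//   if (preferred == 0) {
//     // no valid size found; the caller must handle the error
//   }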

// Standardized blocksize deduction for teams-based parallel constructs with
// LDS usage. Returns the 'preferred' blocksize, as determined by the
// heuristics in hip_internal_get_block_size
//
// The ShmemTeamsFunctor takes two arguments: the hipFuncAttributes and
// the current blocksize under consideration, and returns the LDS usage
//
// Note: a returned block_size of zero indicates that the algorithm could not
// find a valid block size. The caller is responsible for error handling.
template <typename DriverType, typename LaunchBounds,
          typename ShmemTeamsFunctor>
unsigned hip_get_preferred_team_blocksize(HIPInternal const *hip_instance,
                                          ShmemTeamsFunctor const &f) {
  hipFuncAttributes attr =
      get_hip_func_attributes_impl<DriverType, LaunchBounds,
                                   BlockType::Preferred>();
  // get preferred blocksize limited by register usage
  using namespace std::placeholders;
  const unsigned tperb_reg =
      hip_get_preferred_blocksize<DriverType, LaunchBounds>();
  return hip_internal_get_block_size<BlockType::Preferred, DriverType,
                                     LaunchBounds>(
      hip_instance, std::bind(f, attr, _1), tperb_reg);
}
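// [Editor's note] std::bind above partially applies the queried attributes;
// a lambda capturing `attr` is an equivalent, arguably clearer, formulation:
//
//   return hip_internal_get_block_size<BlockType::Preferred, DriverType,
//                                      LaunchBounds>(
//       hip_instance, [&](unsigned b) { return f(attr, b); }, tperb_reg);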

// Standardized blocksize deduction for non-teams parallel constructs with LDS
// usage. Returns the maximum possible blocksize, as determined by the
// heuristics in hip_internal_get_block_size
//
// The ShmemFunctor takes a single argument of the current blocksize under
// consideration, and returns the LDS usage
//
// Note: a returned block_size of zero indicates that the algorithm could not
// find a valid block size. The caller is responsible for error handling.
template <typename DriverType, typename LaunchBounds, typename ShmemFunctor>
unsigned hip_get_max_blocksize(HIPInternal const *hip_instance,
                               ShmemFunctor const &f) {
  // get max blocksize limited by register usage
  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
      hip_instance, f, tperb_reg);
}

// Standardized blocksize deduction for teams-based parallel constructs with
// LDS usage. Returns the maximum possible blocksize, as determined by the
// heuristics in hip_internal_get_block_size
//
// The ShmemTeamsFunctor takes two arguments: the hipFuncAttributes and
// the current blocksize under consideration, and returns the LDS usage
//
// Note: a returned block_size of zero indicates that the algorithm could not
// find a valid block size. The caller is responsible for error handling.
template <typename DriverType, typename LaunchBounds,
          typename ShmemTeamsFunctor>
unsigned hip_get_max_team_blocksize(HIPInternal const *hip_instance,
                                    ShmemTeamsFunctor const &f) {
  hipFuncAttributes attr =
      get_hip_func_attributes_impl<DriverType, LaunchBounds, BlockType::Max>();
  // get max blocksize
  using namespace std::placeholders;
  const unsigned tperb_reg = hip_get_max_blocksize<DriverType, LaunchBounds>();
  return hip_internal_get_block_size<BlockType::Max, DriverType, LaunchBounds>(
      hip_instance, std::bind(f, attr, _1), tperb_reg);
}

}  // namespace Impl

@ -66,12 +66,30 @@ inline void hip_internal_safe_call(hipError_t e, const char* name,
  }
}

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3

KOKKOS_DEPRECATED
inline void hip_internal_safe_call_deprecated(hipError_t e, const char* name,
                                              const char* file = nullptr,
                                              const int line = 0) {
  hip_internal_safe_call(e, name, file, line);
}

#endif

}  // namespace Impl
}  // namespace Kokkos

#define HIP_SAFE_CALL(call) \
#define KOKKOS_IMPL_HIP_SAFE_CALL(call) \
  Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__)

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
#define HIP_SAFE_CALL(call)                                               \
  Kokkos::Impl::hip_internal_safe_call_deprecated(call, #call, __FILE__, \
                                                  __LINE__)

#endif
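// [Editor's sketch, not part of this patch] Usage of the renamed macro at a
// hypothetical call site. The wrapper stringifies the call and forwards
// file/line so a failure reports its origin:
//
//   KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&ptr, nbytes));
//
// expands to
//
//   Kokkos::Impl::hip_internal_safe_call(hipMalloc(&ptr, nbytes),
//                                        "hipMalloc(&ptr, nbytes)",
//                                        __FILE__, __LINE__);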

namespace Kokkos {
namespace Experimental {


@ -77,7 +77,7 @@ class HIPInternalDevices {
};

HIPInternalDevices::HIPInternalDevices() {
  HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount));

  if (m_hipDevCount > MAXIMUM_DEVICE_COUNT) {
    Kokkos::abort(
@ -85,7 +85,7 @@ HIPInternalDevices::HIPInternalDevices() {
        "have. Please report this to github.com/kokkos/kokkos.");
  }
  for (int i = 0; i < m_hipDevCount; ++i) {
    HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
    KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i));
  }
}

@ -95,6 +95,9 @@ const HIPInternalDevices &HIPInternalDevices::singleton() {
}
}  // namespace

unsigned long *Impl::HIPInternal::constantMemHostStaging = nullptr;
hipEvent_t Impl::HIPInternal::constantMemReusable = nullptr;

namespace Impl {

//----------------------------------------------------------------------------
@ -154,6 +157,9 @@ int HIPInternal::verify_is_initialized(const char *const label) const {
  return 0 <= m_hipDev;
}

uint32_t HIPInternal::impl_get_instance_id() const noexcept {
  return m_instance_id;
}
HIPInternal &HIPInternal::singleton() {
  static HIPInternal *self = nullptr;
  if (!self) {
@ -163,12 +169,23 @@ HIPInternal &HIPInternal::singleton() {
}

void HIPInternal::fence() const {
  HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
  // can reset our cycle id now as well
  m_cycleId = 0;
  fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence");
}
void HIPInternal::fence(const std::string &name) const {
  Kokkos::Tools::Experimental::Impl::profile_fence_event<
      Kokkos::Experimental::HIP>(
      name,
      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
          impl_get_instance_id()},
      [&]() {
        KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
        // can reset our cycle id now as well
        m_cycleId = 0;
      });
}
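// [Editor's note] The named fence funnels every stream synchronization
// through the profiling layer: begin-fence callback, the actual wait, then
// end-fence. A minimal sketch of the same idiom, with hypothetical hook
// names standing in for the Kokkos::Tools machinery:
//
//   template <class Body>
//   void profiled_fence(const std::string &name, uint32_t id, Body &&body) {
//     uint64_t handle = begin_fence_callback(name, id);  // hypothetical hook
//     body();                              // e.g. hipStreamSynchronize(...)
//     end_fence_callback(handle);                        // hypothetical hook
//   }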

void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
void HIPInternal::initialize(int hip_device_id, hipStream_t stream,
                             bool manage_stream) {
  if (was_finalized)
    Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n");

@ -197,9 +214,10 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
  m_hipDev = hip_device_id;
  m_deviceProp = hipProp;

  HIP_SAFE_CALL(hipSetDevice(m_hipDev));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(m_hipDev));

  m_stream = stream;
  m_manage_stream = manage_stream;
  m_team_scratch_current_size = 0;
  m_team_scratch_ptr = nullptr;

@ -222,7 +240,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
  // theoretically, we can get 40 WF's / CU, but only can sustain 32
  // see
  // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742
  m_maxBlocksPerSM = 32;
  m_maxWavesPerCU = 32;
  // FIXME_HIP - Nick to implement this upstream
  // Register count comes from Sec. 2.2. "Data Sharing" of the
  // Vega 7nm ISA document (see the diagram)
@ -232,7 +250,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {
  m_regsPerSM = 65536;
  m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor;
  m_maxShmemPerBlock = hipProp.sharedMemPerBlock;
  m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize;
  m_maxThreadsPerSM = m_maxWavesPerCU * HIPTraits::WarpSize;
  //----------------------------------
  // Multiblock reduction uses scratch flags for counters
  // and scratch space for partial reduction values.
@ -265,8 +283,8 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {

    m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>(r->data());

    HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
                            sizeof(uint32_t) * buffer_bound));
    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0,
                                        sizeof(uint32_t) * buffer_bound));
  }
  //----------------------------------

@ -287,6 +305,15 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) {

  // Init the array used for arbitrarily sized atomics
  if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays();

  // Allocate a staging buffer for constant mem in pinned host memory
  // and an event to avoid overwriting driver for previous kernel launches
  if (m_stream == nullptr) {
    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostMalloc((void **)&constantMemHostStaging,
                                            HIPTraits::ConstantMemoryUsage));

    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventCreate(&constantMemReusable));
  }
}

//----------------------------------------------------------------------------
@ -339,7 +366,7 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags(

    m_scratchFlags = reinterpret_cast<size_type *>(r->data());

    HIP_SAFE_CALL(
    KOKKOS_IMPL_HIP_SAFE_CALL(
        hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
  }

@ -365,7 +392,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes,
//----------------------------------------------------------------------------

void HIPInternal::finalize() {
  this->fence();
  this->fence("Kokkos::HIPInternal::finalize: fence on finalization");
  was_finalized = true;
  if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) {
    using RecordHIP =
@ -378,6 +405,9 @@ void HIPInternal::finalize() {
    if (m_team_scratch_current_size > 0)
      Kokkos::kokkos_free<Kokkos::Experimental::HIPSpace>(m_team_scratch_ptr);

    if (m_manage_stream && m_stream != nullptr)
      KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream));

    m_hipDev = -1;
    m_hipArch = -1;
    m_multiProcCount = 0;
@ -395,28 +425,36 @@ void HIPInternal::finalize() {
    m_team_scratch_ptr = nullptr;
  }
  if (nullptr != d_driverWorkArray) {
    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
    d_driverWorkArray = nullptr;
  }

  // only destroy these if we're finalizing the singleton
  if (this == &singleton()) {
    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
  }
}

char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
  std::lock_guard<std::mutex> const lock(m_mutexWorkArray);
  if (d_driverWorkArray == nullptr) {
    HIP_SAFE_CALL(
    KOKKOS_IMPL_HIP_SAFE_CALL(
        hipHostMalloc(&d_driverWorkArray,
                      m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                      hipHostMallocNonCoherent));
  }
  if (driverTypeSize > m_maxDriverTypeSize) {
    // fence handles the cycle id reset for us
    fence();
    HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
    fence(
        "Kokkos::HIPInternal::get_next_driver: fence before reallocating "
        "resources");
    KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray));
    m_maxDriverTypeSize = driverTypeSize;
    if (m_maxDriverTypeSize % 128 != 0)
      m_maxDriverTypeSize =
          m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128;
    HIP_SAFE_CALL(
    KOKKOS_IMPL_HIP_SAFE_CALL(
        hipHostMalloc(&d_driverWorkArray,
                      m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char),
                      hipHostMallocNonCoherent));
@ -424,7 +462,9 @@ char *HIPInternal::get_next_driver(size_t driverTypeSize) const {
    m_cycleId = (m_cycleId + 1) % m_maxDriverCycles;
    if (m_cycleId == 0) {
      // ensure any outstanding kernels are completed before we wrap around
      fence();
      fence(
          "Kokkos::HIPInternal::get_next_driver: fence before reusing first "
          "driver");
    }
  }
  return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId];
@ -462,7 +502,14 @@ Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags(

namespace Kokkos {
namespace Impl {
void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
void hip_device_synchronize(const std::string &name) {
  Kokkos::Tools::Experimental::Impl::profile_fence_event<
      Kokkos::Experimental::HIP>(
      name,
      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
          GlobalDeviceSynchronization,
      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
}

void hip_internal_error_throw(hipError_t e, const char *name, const char *file,
                              const int line) {

@ -48,6 +48,7 @@
#define KOKKOS_HIP_INSTANCE_HPP

#include <Kokkos_HIP_Space.hpp>
#include <HIP/Kokkos_HIP_Error.hpp>

#include <mutex>

@ -59,10 +60,12 @@ struct HIPTraits {
  static int constexpr WarpSize = 64;
  static int constexpr WarpIndexMask = 0x003f; /* hexadecimal for 63 */
  static int constexpr WarpIndexShift = 6;     /* WarpSize == 1 << WarpShift */
  static int constexpr ConservativeThreadsPerBlock =
      256;  // conservative fallback blocksize in case of spills
  static int constexpr MaxThreadsPerBlock =
      1024;  // FIXME_HIP -- assumed constant for now

      1024;  // the maximum we can fit in a block
  static int constexpr ConstantMemoryUsage = 0x008000;        /* 32k bytes */
  static int constexpr KernelArgumentLimit = 0x001000;        /* 4k bytes  */
  static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */
};


@ -90,7 +93,7 @@ class HIPInternal {
  unsigned m_multiProcCount = 0;
  unsigned m_maxWarpCount = 0;
  unsigned m_maxBlock = 0;
  unsigned m_maxBlocksPerSM = 0;
  unsigned m_maxWavesPerCU = 0;
  unsigned m_maxSharedWords = 0;
  int m_regsPerSM;
  int m_shmemPerSM = 0;
@ -108,6 +111,8 @@ class HIPInternal {
  mutable int m_cycleId = 0;
  // mutex to access d_driverWorkArray
  mutable std::mutex m_mutexWorkArray;
  // mutex to access shared memory
  mutable std::mutex m_mutexSharedMemory;

  // Scratch Spaces for Reductions
  size_type m_scratchSpaceCount = 0;
@ -119,7 +124,10 @@ class HIPInternal {

  hipDeviceProp_t m_deviceProp;

  hipStream_t m_stream = nullptr;
  hipStream_t m_stream = nullptr;
  uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance<
      Kokkos::Experimental::HIP>(reinterpret_cast<uintptr_t>(this));
  bool m_manage_stream = false;

  // Team Scratch Level 1 Space
  mutable int64_t m_team_scratch_current_size = 0;
@ -128,18 +136,25 @@ class HIPInternal {

  bool was_finalized = false;

  // FIXME_HIP: these want to be per-device, not per-stream... use of 'static'
  // here will break once there are multiple devices though
  static unsigned long *constantMemHostStaging;
  static hipEvent_t constantMemReusable;

  static HIPInternal &singleton();

  int verify_is_initialized(const char *const label) const;

  int is_initialized() const { return m_hipDev >= 0; }

  void initialize(int hip_device_id, hipStream_t stream = nullptr);
  void initialize(int hip_device_id, hipStream_t stream = nullptr,
                  bool manage_stream = false);
  void finalize();

  void print_configuration(std::ostream &) const;

  void fence() const;
  void fence(const std::string &) const;

  // returns the next driver type pointer in our work array
  char *get_next_driver(size_t driverTypeSize) const;
@ -151,13 +166,52 @@ class HIPInternal {
  // Resizing of reduction related scratch spaces
  size_type *scratch_space(const size_type size);
  size_type *scratch_flags(const size_type size);

  uint32_t impl_get_instance_id() const noexcept;
  // Resizing of team level 1 scratch
  void *resize_team_scratch_space(std::int64_t bytes,
                                  bool force_shrink = false);
};

}  // namespace Impl

// Partitioning an Execution Space: expects space and integer arguments for
// relative weight
//   Customization point for backends
//   Default behavior is to return the passed in instance

namespace Impl {
inline void create_HIP_instances(std::vector<HIP> &instances) {
  for (int s = 0; s < int(instances.size()); s++) {
    hipStream_t stream;
    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
    instances[s] = HIP(stream, true);
  }
}
}  // namespace Impl

template <class... Args>
std::vector<HIP> partition_space(const HIP &, Args...) {
#ifdef __cpp_fold_expressions
  static_assert(
      (... && std::is_arithmetic_v<Args>),
      "Kokkos Error: partitioning arguments must be integers or floats");
#endif

  std::vector<HIP> instances(sizeof...(Args));
  Impl::create_HIP_instances(instances);
  return instances;
}

template <class T>
std::vector<HIP> partition_space(const HIP &, std::vector<T> &weights) {
  static_assert(
      std::is_arithmetic<T>::value,
      "Kokkos Error: partitioning arguments must be integers or floats");

  std::vector<HIP> instances(weights.size());
  Impl::create_HIP_instances(instances);
  return instances;
}
}  // namespace Experimental
}  // namespace Kokkos
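// [Editor's sketch, not part of this patch] Usage of the new partitioning
// API at a hypothetical call site. Each returned instance owns a freshly
// created stream, so kernels submitted to different partitions may overlap
// on the device:
//
//   Kokkos::Experimental::HIP space;
//   auto instances =
//       Kokkos::Experimental::partition_space(space, 1, 1);  // two instances
//   // launch independent work on instances[0] and instances[1] ...
//
// Note that, as implemented above, the weights only determine how many
// instances are created; every instance gets its own stream regardless of
// the requested ratio.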


@ -52,6 +52,7 @@
#include <HIP/Kokkos_HIP_Error.hpp>
#include <HIP/Kokkos_HIP_Instance.hpp>
#include <Kokkos_HIP_Space.hpp>
#include <HIP/Kokkos_HIP_Locks.hpp>

// Must use global variable on the device with HIP-Clang
#ifdef __HIP__
@ -64,7 +65,7 @@ namespace Kokkos {
namespace Experimental {
template <typename T>
inline __device__ T *kokkos_impl_hip_shared_memory() {
  HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh);
  extern __shared__ Kokkos::Experimental::HIPSpace::size_type sh[];
  return (T *)sh;
}
}  // namespace Experimental
@ -74,10 +75,12 @@ namespace Kokkos {
namespace Experimental {
namespace Impl {

// The hip_parallel_launch_*_memory code is identical to the cuda code
template <typename DriverType>
__global__ static void hip_parallel_launch_constant_memory() {
  const DriverType &driver = *(reinterpret_cast<const DriverType *>(
      kokkos_impl_hip_constant_memory_buffer));

  driver();
}

@ -87,12 +90,13 @@ __global__ __launch_bounds__(
  const DriverType &driver = *(reinterpret_cast<const DriverType *>(
      kokkos_impl_hip_constant_memory_buffer));

  driver->operator()();
  driver();
}

template <class DriverType>
__global__ static void hip_parallel_launch_local_memory(
    const DriverType *driver) {
  // FIXME_HIP driver() pass by copy
  driver->operator()();
}

@ -101,6 +105,21 @@ __global__ __launch_bounds__(
    maxTperB,
    minBperSM) static void hip_parallel_launch_local_memory(const DriverType
                                                                *driver) {
  // FIXME_HIP driver() pass by copy
  driver->operator()();
}

template <typename DriverType>
__global__ static void hip_parallel_launch_global_memory(
    const DriverType *driver) {
  driver->operator()();
}

template <typename DriverType, unsigned int maxTperB, unsigned int minBperSM>
__global__ __launch_bounds__(
    maxTperB,
    minBperSM) static void hip_parallel_launch_global_memory(const DriverType
                                                                 *driver) {
  driver->operator()();
}

@ -127,33 +146,238 @@ struct HIPDispatchProperties {
  HIPLaunchMechanism launch_mechanism = l;
};

// Use local memory up to ConstantMemoryUseThreshold
// Use global memory above ConstantMemoryUsage
// In between use ConstantMemory
// The following code is identical to the cuda code
template <typename DriverType>
struct DeduceHIPLaunchMechanism {
  static constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t
      light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight;
  static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t
      heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight;
  static constexpr typename DriverType::Policy::work_item_property property =
      typename DriverType::Policy::work_item_property();

  static constexpr HIPLaunchMechanism valid_launch_mechanism =
      // BuildValidMask
      (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
           ? HIPLaunchMechanism::LocalMemory
           : HIPLaunchMechanism::Default) |
      (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
           ? HIPLaunchMechanism::ConstantMemory
           : HIPLaunchMechanism::Default) |
      HIPLaunchMechanism::GlobalMemory;

  static constexpr HIPLaunchMechanism requested_launch_mechanism =
      (((property & light_weight) == light_weight)
           ? HIPLaunchMechanism::LocalMemory
           : HIPLaunchMechanism::ConstantMemory) |
      HIPLaunchMechanism::GlobalMemory;

  static constexpr HIPLaunchMechanism default_launch_mechanism =
      // BuildValidMask
      (sizeof(DriverType) < HIPTraits::ConstantMemoryUseThreshold)
          ? HIPLaunchMechanism::LocalMemory
          : ((sizeof(DriverType) < HIPTraits::ConstantMemoryUsage)
                 ? HIPLaunchMechanism::ConstantMemory
                 : HIPLaunchMechanism::GlobalMemory);

  //              None               LightWeight        HeavyWeight
  // F<UseT       LCG  LCG L  L      LCG  LG  L  L      LCG  CG  L  C
  // UseT<F<KAL   LCG  LCG C  C      LCG  LG  C  L      LCG  CG  C  C
  // KAL<F<CMU    CG   LCG C  C      CG   LG  C  G      CG   CG  C  C
  // CMU<F        G    LCG G  G      G    LG  G  G      G    CG  G  G
  static constexpr HIPLaunchMechanism launch_mechanism =
      ((property & light_weight) == light_weight)
          ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit
                 ? HIPLaunchMechanism::LocalMemory
                 : HIPLaunchMechanism::GlobalMemory)
          : (((property & heavy_weight) == heavy_weight)
                 ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage
                        ? HIPLaunchMechanism::ConstantMemory
                        : HIPLaunchMechanism::GlobalMemory)
                 : (default_launch_mechanism));
};
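// [Editor's worked example, derived from the constants above] With
// KernelArgumentLimit = 4k, ConstantMemoryUseThreshold = 512 and
// ConstantMemoryUsage = 32k, the default (no weight hint) deduction gives:
//
//   sizeof(DriverType) =   256 bytes -> LocalMemory    (below the 512 threshold)
//   sizeof(DriverType) =  2048 bytes -> ConstantMemory (between 512 and 32k)
//   sizeof(DriverType) = 40960 bytes -> GlobalMemory   (above 32k)
//
// A HintLightWeight policy instead chooses LocalMemory whenever the functor
// fits in the 4k kernel-argument limit, and GlobalMemory otherwise.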

template <typename DriverType, typename LaunchBounds,
          HIPLaunchMechanism LaunchMechanism>
struct HIPParallelLaunchKernelFuncData {
  static unsigned int get_scratch_size(
      hipFuncAttributes const &hip_func_attributes) {
    return hip_func_attributes.localSizeBytes;
  }

  static hipFuncAttributes get_hip_func_attributes(void const *kernel_func) {
    static hipFuncAttributes attr = [=]() {
      hipFuncAttributes attr;
      KOKKOS_IMPL_HIP_SAFE_CALL(hipFuncGetAttributes(&attr, kernel_func));
      return attr;
    }();
    return attr;
  }
};
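// [Editor's note] The function-local static above is the standard C++11
// idiom for caching a one-time query: the lambda runs exactly once per
// instantiation, and later calls return the cached copy, thread-safely.
// A minimal sketch of the idiom in isolation (hypothetical names):
//
//   const Expensive &cached() {
//     static Expensive value = [] { return compute_once(); }();
//     return value;
//   }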

//---------------------------------------------------------------//
// HIPParallelLaunchKernelFunc structure and its specializations //
//---------------------------------------------------------------//
template <typename DriverType, typename LaunchBounds,
          HIPLaunchMechanism LaunchMechanism>
struct HIPParallelLaunchKernelFunc;

// HIPLaunchMechanism::LocalMemory specializations
template <typename DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM>
struct HIPParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    HIPLaunchMechanism::LocalMemory> {
  using funcdata_t = HIPParallelLaunchKernelFuncData<
      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
      HIPLaunchMechanism::LocalMemory>;
  static auto get_kernel_func() {
    return hip_parallel_launch_local_memory<DriverType, MaxThreadsPerBlock,
                                            MinBlocksPerSM>;
  }

  static constexpr auto default_launchbounds() { return false; }

  static auto get_scratch_size() {
    return funcdata_t::get_scratch_size(get_hip_func_attributes());
  }

  static hipFuncAttributes get_hip_func_attributes() {
    return funcdata_t::get_hip_func_attributes(
        reinterpret_cast<void const *>(get_kernel_func()));
  }
};

template <typename DriverType>
struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
                                   HIPLaunchMechanism::LocalMemory> {
  using funcdata_t =
      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
                                      HIPLaunchMechanism::LocalMemory>;
  static auto get_kernel_func() {
    return hip_parallel_launch_local_memory<DriverType, 1024, 1>;
    return HIPParallelLaunchKernelFunc<
        DriverType, Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
        HIPLaunchMechanism::LocalMemory>::get_kernel_func();
  }

  static constexpr auto default_launchbounds() { return true; }

  static auto get_scratch_size() {
    return funcdata_t::get_scratch_size(get_hip_func_attributes());
  }

  static hipFuncAttributes get_hip_func_attributes() {
    return funcdata_t::get_hip_func_attributes(
        reinterpret_cast<void const *>(get_kernel_func()));
  }
};

// HIPLaunchMechanism::GlobalMemory specializations
template <typename DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM>
struct HIPParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    HIPLaunchMechanism::GlobalMemory> {
  using funcdata_t = HIPParallelLaunchKernelFuncData<
      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
      HIPLaunchMechanism::GlobalMemory>;
  static auto get_kernel_func() {
    return hip_parallel_launch_global_memory<DriverType, MaxThreadsPerBlock,
                                             MinBlocksPerSM>;
  }

  static constexpr auto default_launchbounds() { return false; }

  static auto get_scratch_size() {
    return funcdata_t::get_scratch_size(get_hip_func_attributes());
  }

  static hipFuncAttributes get_hip_func_attributes() {
    return funcdata_t::get_hip_func_attributes(
        reinterpret_cast<void const *>(get_kernel_func()));
  }
};

template <typename DriverType>
struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
                                   HIPLaunchMechanism::GlobalMemory> {
  using funcdata_t =
      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
                                      HIPLaunchMechanism::GlobalMemory>;
  static auto get_kernel_func() {
    return hip_parallel_launch_global_memory<DriverType>;
  }

  static constexpr auto default_launchbounds() { return true; }

  static auto get_scratch_size() {
    return funcdata_t::get_scratch_size(get_hip_func_attributes());
  }

  static hipFuncAttributes get_hip_func_attributes() {
    return funcdata_t::get_hip_func_attributes(
        reinterpret_cast<void const *>(get_kernel_func()));
  }
};

// HIPLaunchMechanism::ConstantMemory specializations
template <typename DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM>
struct HIPParallelLaunchKernelFunc<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    HIPLaunchMechanism::ConstantMemory> {
  using funcdata_t = HIPParallelLaunchKernelFuncData<
      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
      HIPLaunchMechanism::ConstantMemory>;
  static auto get_kernel_func() {
    return hip_parallel_launch_constant_memory<DriverType, MaxThreadsPerBlock,
                                               MinBlocksPerSM>;
  }

  static constexpr auto default_launchbounds() { return false; }

  static auto get_scratch_size() {
    return funcdata_t::get_scratch_size(get_hip_func_attributes());
  }

  static hipFuncAttributes get_hip_func_attributes() {
    return funcdata_t::get_hip_func_attributes(
        reinterpret_cast<void const *>(get_kernel_func()));
  }
};

template <typename DriverType>
struct HIPParallelLaunchKernelFunc<DriverType, Kokkos::LaunchBounds<0, 0>,
                                   HIPLaunchMechanism::ConstantMemory> {
  using funcdata_t =
      HIPParallelLaunchKernelFuncData<DriverType, Kokkos::LaunchBounds<0, 0>,
                                      HIPLaunchMechanism::ConstantMemory>;
  static auto get_kernel_func() {
    return hip_parallel_launch_constant_memory<DriverType>;
  }
  static constexpr auto default_launchbounds() { return true; }

  static auto get_scratch_size() {
    return funcdata_t::get_scratch_size(get_hip_func_attributes());
  }

  static hipFuncAttributes get_hip_func_attributes() {
    return funcdata_t::get_hip_func_attributes(
        reinterpret_cast<void const *>(get_kernel_func()));
  }
};

//------------------------------------------------------------------//
// HIPParallelLaunchKernelInvoker structure and its specializations //
//------------------------------------------------------------------//
template <typename DriverType, typename LaunchBounds,
          HIPLaunchMechanism LaunchMechanism>
struct HIPParallelLaunchKernelInvoker;

// HIPLaunchMechanism::LocalMemory specialization
template <typename DriverType, typename LaunchBounds>
struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
                                      HIPLaunchMechanism::LocalMemory>
@ -170,21 +394,83 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
  }
};

// HIPLaunchMechanism::GlobalMemory specialization
template <typename DriverType, typename LaunchBounds>
struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
                                      HIPLaunchMechanism::GlobalMemory>
    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
                                  HIPLaunchMechanism::GlobalMemory> {
  using base_t = HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
                                             HIPLaunchMechanism::GlobalMemory>;

  // FIXME_HIP the code is different than cuda because driver cannot be passed
  // by copy
  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
                            dim3 const &block, int shmem,
                            HIPInternal const *hip_instance) {
    (base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
        driver);
  }
};

// HIPLaunchMechanism::ConstantMemory specializations
template <typename DriverType, typename LaunchBounds>
struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
                                      HIPLaunchMechanism::ConstantMemory>
    : HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
                                  HIPLaunchMechanism::ConstantMemory> {
  using base_t =
      HIPParallelLaunchKernelFunc<DriverType, LaunchBounds,
                                  HIPLaunchMechanism::ConstantMemory>;
  static_assert(sizeof(DriverType) < HIPTraits::ConstantMemoryUsage,
                "Kokkos Error: Requested HIPLaunchConstantMemory with a "
                "Functor larger than 32kB.");

  static void invoke_kernel(DriverType const *driver, dim3 const &grid,
                            dim3 const &block, int shmem,
                            HIPInternal const *hip_instance) {
    // Wait until the previous kernel that uses the constant buffer is done
    KOKKOS_IMPL_HIP_SAFE_CALL(
        hipEventSynchronize(hip_instance->constantMemReusable));

    // Copy functor (synchronously) to staging buffer in pinned host memory
    unsigned long *staging = hip_instance->constantMemHostStaging;
    std::memcpy((void *)staging, (void *)driver, sizeof(DriverType));

    // Copy functor asynchronously from there to constant memory on the device
    KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(
        HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer), staging,
        sizeof(DriverType), 0, hipMemcpyHostToDevice, hip_instance->m_stream));

    // Invoke the driver function on the device
    (base_t::
         get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>();

    // Record an event that says when the constant buffer can be reused
    KOKKOS_IMPL_HIP_SAFE_CALL(hipEventRecord(hip_instance->constantMemReusable,
                                             hip_instance->m_stream));
  }
};
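// [Editor's note] The event above implements a single-slot handshake around
// the shared constant-memory buffer: wait on the event before overwriting the
// staging area, then record it after the launch so the next launch knows when
// the buffer is free again. A stripped-down sketch of the same pattern
// (hypothetical helper names, real HIP calls):
//
//   hipEventSynchronize(reuse_event);        // previous user is done
//   fill(staging_buffer);                    // host-side copy of the payload
//   hipMemcpyToSymbolAsync(symbol, staging_buffer, size, 0,
//                          hipMemcpyHostToDevice, stream);
//   launch_kernel(stream);
//   hipEventRecord(reuse_event, stream);     // buffer reusable after this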

//-----------------------------//
// HIPParallelLaunch structure //
//-----------------------------//
template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
          HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory>
          HIPLaunchMechanism LaunchMechanism =
              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
struct HIPParallelLaunch;

template <typename DriverType, unsigned int MaxThreadsPerBlock,
          unsigned int MinBlocksPerSM>
          unsigned int MinBlocksPerSM, HIPLaunchMechanism LaunchMechanism>
struct HIPParallelLaunch<
    DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
    HIPLaunchMechanism::LocalMemory>
    LaunchMechanism>
    : HIPParallelLaunchKernelInvoker<
          DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
          HIPLaunchMechanism::LocalMemory> {
          LaunchMechanism> {
  using base_t = HIPParallelLaunchKernelInvoker<
      DriverType, Kokkos::LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM>,
      HIPLaunchMechanism::LocalMemory>;
      LaunchMechanism>;

  HIPParallelLaunch(const DriverType &driver, const dim3 &grid,
                    const dim3 &block, const int shmem,
@ -205,22 +491,48 @@ struct HIPParallelLaunch<
      base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance);

#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK)
      HIP_SAFE_CALL(hipGetLastError());
      hip_instance->fence();
      KOKKOS_IMPL_HIP_SAFE_CALL(hipGetLastError());
      hip_instance->fence(
          "Kokkos::Experimental::Impl::HIPParallelLaunch: Debug Only Check "
          "for Execution Error");
#endif
    }
  }

  static hipFuncAttributes get_hip_func_attributes() {
    static hipFuncAttributes attr = []() {
      hipFuncAttributes attr;
      HIP_SAFE_CALL(hipFuncGetAttributes(
          &attr, reinterpret_cast<void const *>(base_t::get_kernel_func())));
      return attr;
    }();
    return attr;
  }
};
// convenience method to launch the correct kernel given the launch bounds et
// al.
template <typename DriverType, typename LaunchBounds = Kokkos::LaunchBounds<>,
          HIPLaunchMechanism LaunchMechanism =
              DeduceHIPLaunchMechanism<DriverType>::launch_mechanism>
void hip_parallel_launch(const DriverType &driver, const dim3 &grid,
                         const dim3 &block, const int shmem,
                         const HIPInternal *hip_instance,
                         const bool prefer_shmem) {
  // FIXME_HIP - could be if constexpr for c++17
  if (!HIPParallelLaunch<DriverType, LaunchBounds,
                         LaunchMechanism>::default_launchbounds()) {
    // for user defined, we *always* honor the request
    HIPParallelLaunch<DriverType, LaunchBounds, LaunchMechanism>(
        driver, grid, block, shmem, hip_instance, prefer_shmem);
  } else {
    // we can do what we like
    const unsigned flat_block_size = block.x * block.y * block.z;
    if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) {
      // we have to use the large blocksize
      HIPParallelLaunch<
          DriverType,
          Kokkos::LaunchBounds<HIPTraits::ConservativeThreadsPerBlock, 1>,
          LaunchMechanism>(driver, grid, block, shmem, hip_instance,
                           prefer_shmem);
    } else {
      HIPParallelLaunch<DriverType,
                        Kokkos::LaunchBounds<HIPTraits::MaxThreadsPerBlock, 1>,
                        LaunchMechanism>(driver, grid, block, shmem,
                                         hip_instance, prefer_shmem);
    }
  }
}
}  // namespace Impl
}  // namespace Experimental
}  // namespace Kokkos

@ -84,11 +84,17 @@ namespace Impl {
HIPLockArrays g_host_hip_lock_arrays = {nullptr, nullptr, 0};

void initialize_host_hip_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
  desul::Impl::init_lock_arrays();

  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE();
#endif

  if (g_host_hip_lock_arrays.atomic != nullptr) return;
  HIP_SAFE_CALL(hipMalloc(
  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
      &g_host_hip_lock_arrays.atomic,
      sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1)));
  HIP_SAFE_CALL(hipMalloc(
  KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(
      &g_host_hip_lock_arrays.scratch,
      sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency())));

@ -103,10 +109,14 @@ void initialize_host_hip_lock_arrays() {
}

void finalize_host_hip_lock_arrays() {
#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
  desul::Impl::finalize_lock_arrays();
#endif

  if (g_host_hip_lock_arrays.atomic == nullptr) return;
  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic));
  g_host_hip_lock_arrays.atomic = nullptr;
  HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch));
  g_host_hip_lock_arrays.scratch = nullptr;
  g_host_hip_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE

@ -51,6 +51,10 @@

#include <HIP/Kokkos_HIP_Error.hpp>

#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS
#include <desul/atomics/Lock_Array_HIP.hpp>
#endif

namespace Kokkos {
namespace Impl {

@ -147,7 +151,7 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
#define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()                 \
  {                                                             \
    if (::Kokkos::Impl::lock_array_copied == 0) {               \
      HIP_SAFE_CALL(hipMemcpyToSymbol(                          \
      KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol(              \
          HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \
          &::Kokkos::Impl::g_host_hip_lock_arrays,              \
          sizeof(::Kokkos::Impl::HIPLockArrays)));              \
@ -155,6 +159,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
    ::Kokkos::Impl::lock_array_copied = 1;                      \
  }

#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS

#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
#else
@ -162,6 +168,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()
#endif

#else

#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
#else
// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc.
#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \
  KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE()         \
  DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE()
#endif

#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */

#endif /* defined( __HIPCC__ ) */

#endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */

@ -28,7 +28,8 @@ inline TileSizeProperties get_tile_size_properties<Kokkos::Experimental::HIP>(
      space.impl_internal_space_instance()->m_maxThreadsPerSM;
  properties.default_largest_tile_size = 16;
  properties.default_tile_size = 4;
  properties.max_total_tile_size = 1024;
  properties.max_total_tile_size =
      Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
  return properties;
}


@ -81,6 +81,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
  }

  inline void execute() const {
    using ClosureType =
        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
    if (m_policy.m_num_tiles == 0) return;
    array_index_type const maxblocks = static_cast<array_index_type>(
        m_policy.space().impl_internal_space_instance()->m_maxBlock);
@ -94,7 +96,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                       block.y,
                   maxblocks),
          1);
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
                                                      LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 3) {
@ -110,7 +113,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
          std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) /
                       block.z,
                   maxblocks));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
                                                      LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 4) {
@ -128,7 +132,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
          std::min((m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) /
                       block.z,
                   maxblocks));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
                                                      LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 5) {
@ -147,7 +152,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
          std::min((m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) /
                       block.z,
                   maxblocks));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
                                                      LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else if (Policy::rank == 6) {
@ -165,7 +171,8 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
          std::min(static_cast<index_type>(m_policy.m_tile_end[4] *
                                           m_policy.m_tile_end[5]),
                   static_cast<index_type>(maxblocks)));
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
                                                      LaunchBounds>(
          *this, grid, block, 0,
          m_policy.space().impl_internal_space_instance(), false);
    } else {
@ -178,22 +185,18 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
      : m_functor(arg_functor), m_policy(arg_policy) {}

  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy& pol, const Functor&) {
  static int max_tile_size_product(const Policy&, const Functor&) {
    using closure_type =
        ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                    Kokkos::Experimental::HIP>;
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type, LaunchBounds>::get_hip_func_attributes();
    auto const& prop = pol.space().hip_device_prop();
    // Limits due to registers/SM, MDRange doesn't have
    // shared memory constraints
    int const regs_per_sm = prop.regsPerMultiprocessor;
    int const regs_per_thread = attr.numRegs;
    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
    return std::min(
        max_threads_per_sm,
        static_cast<int>(
            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
    unsigned block_size =
        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
                                                          LaunchBounds>();
    if (block_size == 0)
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
                      "tile size."));
    return block_size;
  }
};

@ -242,6 +245,9 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
  const bool m_result_ptr_device_accessible;
  size_type* m_scratch_space;
  size_type* m_scratch_flags;
  // Only let one Parallel/Scan modify the shared memory. The
  // constructor acquires the mutex which is released in the destructor.
  std::unique_lock<std::mutex> m_shared_memory_lock;

  using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile<
      Policy::rank, Policy, FunctorType, WorkTag, reference_type>;
@ -307,32 +313,30 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
  // Determine block size constrained by shared memory:
  // This is copy/paste from Kokkos_HIP_Parallel_Range
  inline unsigned local_block_size(const FunctorType& f) {
    unsigned int n =
        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
    int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
        false, FunctorType, WorkTag>(f, n);
    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type, LaunchBounds>::get_hip_func_attributes();
    while (
        (n &&
         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
          shmem_size)) ||
        (n >
         static_cast<unsigned>(
             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
                                                                  LaunchBounds>(
                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
                 shmem_size, 0)))) {
      n >>= 1;
      shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
          false, FunctorType, WorkTag>(f, n);
    const auto& instance = m_policy.space().impl_internal_space_instance();
    auto shmem_functor = [&f](unsigned n) {
      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                      WorkTag>(f, n);
    };
    using closure_type = ParallelReduce<FunctorType, Policy, ReducerType,
                                        Kokkos::Experimental::HIP>;

    unsigned block_size =
        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<closure_type,
                                                                LaunchBounds>(
            instance, shmem_functor);
    if (block_size == 0) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
                      "valid tile size."));
    }
    return n;
    return block_size;
  }
|
||||
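The rewrite above replaces the hand-rolled halving loop with hip_get_preferred_blocksize, which receives a callable that maps a candidate block size to its shared-memory demand. A minimal sketch of that contract, using hypothetical names rather than the library's actual signature:

// Sketch only: pick the largest power-of-two block size whose shared-memory
// demand, as reported by the callable, still fits the per-block limit.
template <class ShmemFunctor>
unsigned preferred_blocksize_sketch(unsigned max_threads, unsigned max_shmem,
                                    ShmemFunctor&& shmem_for) {
  for (unsigned n = max_threads; n > 0; n >>= 1)
    if (shmem_for(n) <= max_shmem) return n;
  return 0;  // callers translate 0 into throw_runtime_exception, as above
}
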
  inline void execute() {
    const int nwork = m_policy.m_num_tiles;
    using ClosureType = ParallelReduce<FunctorType, Policy, ReducerType,
                                       Kokkos::Experimental::HIP>;
    const int nwork = m_policy.m_num_tiles;
    if (nwork) {
      int block_size = m_policy.m_prod_tile_dims;
      // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
@@ -366,14 +370,16 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
          ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem<
              false, FunctorType, WorkTag>(m_functor, block.y);

      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
                                                    LaunchBounds>(
      Kokkos::Experimental::Impl::hip_parallel_launch<ClosureType,
                                                      LaunchBounds>(
          *this, grid, block, shmem,
          m_policy.space().impl_internal_space_instance(),
          false);  // copy to device and execute

      if (!m_result_ptr_device_accessible) {
        m_policy.space().fence();
        m_policy.space().fence(
            "Kokkos::Impl::ParallelReduce<MDRangePolicy,HIP>: fence because "
            "reduction can't access result storage location");

        if (m_result_ptr) {
          const int size = ValueTraits::value_size(
@@ -403,7 +409,10 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
            MemorySpaceAccess<Kokkos::Experimental::HIPSpace,
                              typename ViewType::memory_space>::accessible),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr) {}
        m_scratch_flags(nullptr),
        m_shared_memory_lock(m_policy.space()
                                 .impl_internal_space_instance()
                                 ->m_mutexSharedMemory) {}

  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                 const ReducerType& reducer)
@@ -416,23 +425,25 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                          typename ReducerType::result_view_type::
                              memory_space>::accessible),
        m_scratch_space(nullptr),
        m_scratch_flags(nullptr) {}
        m_scratch_flags(nullptr),
        m_shared_memory_lock(m_policy.space()
                                 .impl_internal_space_instance()
                                 ->m_mutexSharedMemory) {}

  template <typename Policy, typename Functor>
  static int max_tile_size_product(const Policy& pol, const Functor&) {
  static int max_tile_size_product(const Policy&, const Functor&) {
    using closure_type =
        ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>,
                       ReducerType, Kokkos::Experimental::HIP>;
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type, LaunchBounds>::get_hip_func_attributes();
    auto const& prop = pol.space().hip_device_prop();
    // Limits due to registers/SM
    int const regs_per_sm = prop.regsPerMultiprocessor;
    int const regs_per_thread = attr.numRegs;
    int const max_threads_per_sm = regs_per_sm / regs_per_thread;
    return std::min(
        max_threads_per_sm,
        static_cast<int>(
            Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock));
    unsigned block_size =
        Kokkos::Experimental::Impl::hip_get_max_blocksize<closure_type,
                                                          LaunchBounds>();
    if (block_size == 0) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
                      "valid tile size."));
    }
    return block_size;
  }
};
}  // namespace Impl

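The new m_shared_memory_lock members serialize kernels that share the backend's scratch memory. Because std::unique_lock is a data member initialized from the shared mutex, the lock is taken in the constructor's member-initializer list and released automatically when the dispatch object is destroyed. A self-contained sketch of the idiom (hypothetical class name):

#include <mutex>

class ScratchUserSketch {
  std::unique_lock<std::mutex> m_shared_memory_lock;

 public:
  explicit ScratchUserSketch(std::mutex& shared_memory_mutex)
      : m_shared_memory_lock(shared_memory_mutex) {}
  // While this object lives, no other ScratchUserSketch can be constructed
  // from the same mutex, so the shared scratch allocations stay consistent.
};
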
@@ -108,16 +108,21 @@ class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
  inline void execute() const {
    const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();

    using DriverType =
        ParallelFor<FunctorType, Policy, Kokkos::Experimental::HIP>;
    const int block_size =
        LaunchBounds::maxTperB
            ? LaunchBounds::maxTperB
            : ::Kokkos::Experimental::Impl::HIPTraits::
                  MaxThreadsPerBlock;  // FIXME_HIP Choose block_size better
        Kokkos::Experimental::Impl::hip_get_preferred_blocksize<DriverType,
                                                                LaunchBounds>();
    const dim3 block(1, block_size, 1);
    const dim3 grid(
        typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1);

    Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, LaunchBounds>(
    if (block_size == 0) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a "
                      "valid execution configuration."));
    }
    Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
        *this, grid, block, 0, m_policy.space().impl_internal_space_instance(),
        false);
  }
@@ -173,15 +178,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
  const bool m_result_ptr_host_accessible;
  size_type* m_scratch_space = nullptr;
  size_type* m_scratch_flags = nullptr;
  // Only let one ParallelReduce/Scan modify the shared memory. The
  // constructor acquires the mutex which is released in the destructor.
  std::unique_lock<std::mutex> m_shared_memory_lock;

#if HIP_VERSION < 401
  static bool constexpr UseShflReduction =
      ((sizeof(value_type) > 2 * sizeof(double)) &&
       static_cast<bool>(ValueTraits::StaticValueSize));
#else
  static bool constexpr UseShflReduction =
      static_cast<bool>(ValueTraits::StaticValueSize);
#endif

 private:
  struct ShflReductionTag {};
@@ -328,30 +330,15 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,

  // Determine block size constrained by shared memory:
  inline unsigned local_block_size(const FunctorType& f) {
    unsigned int n =
        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock;
    int shmem_size =
        hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
            f, n);
    using closure_type = Impl::ParallelReduce<FunctorType, Policy, ReducerType>;
    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type, LaunchBounds>::get_hip_func_attributes();
    while (
        (n &&
         (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
          shmem_size)) ||
        (n >
         static_cast<unsigned int>(
             ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
                                                                  LaunchBounds>(
                 m_policy.space().impl_internal_space_instance(), attr, f, 1,
                 shmem_size, 0)))) {
      n >>= 1;
      shmem_size =
          hip_single_inter_block_reduce_scan_shmem<false, FunctorType, WorkTag>(
              f, n);
    }
    return n;
    const auto& instance = m_policy.space().impl_internal_space_instance();
    auto shmem_functor = [&f](unsigned n) {
      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                      WorkTag>(f, n);
    };
    using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
                                      Kokkos::Experimental::HIP>;
    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
        DriverType, LaunchBounds>(instance, shmem_functor);
  }

  inline void execute() {
@@ -362,7 +349,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
        !std::is_same<ReducerType, InvalidType>::value;
    if ((nwork > 0) || need_device_set) {
      const int block_size = local_block_size(m_functor);
      KOKKOS_ASSERT(block_size > 0);
      if (block_size == 0) {
        Kokkos::Impl::throw_runtime_exception(
            std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
                        "valid execution configuration."));
      }

      m_scratch_space =
          ::Kokkos::Experimental::Impl::hip_internal_scratch_space(
@@ -391,14 +382,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                                                       WorkTag>(m_functor,
                                                                block.y);

      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
                                                    LaunchBounds>(
      using DriverType = ParallelReduce<FunctorType, Policy, ReducerType,
                                        Kokkos::Experimental::HIP>;
      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
          *this, grid, block, shmem,
          m_policy.space().impl_internal_space_instance(),
          false);  // copy to device and execute

      if (!m_result_ptr_device_accessible) {
        m_policy.space().impl_internal_space_instance()->fence();
        m_policy.space().impl_internal_space_instance()->fence(
            "Kokkos::Impl::ParallelReduce<RangePolicy,HIP>: fence because "
            "reduction can't access result storage location");

        if (m_result_ptr) {
          const int size = ValueTraits::value_size(
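These hunks also show the 3.5 fence-naming change: every fence now carries a label that profiling tools can attribute to a call site instead of an anonymous wait. From user code the same facility looks like this (sketch; assumes a HIP-enabled Kokkos build and prior asynchronous work on the instance):

Kokkos::Experimental::HIP space;
// ... launch asynchronous work on `space` ...
space.fence("my_app::wait_for_force_kernel");  // named instance fence
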
@@ -429,7 +423,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
                              typename ViewType::memory_space>::accessible),
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ViewType::memory_space>::accessible) {}
                              typename ViewType::memory_space>::accessible),
        m_shared_memory_lock(m_policy.space()
                                 .impl_internal_space_instance()
                                 ->m_mutexSharedMemory) {}

  ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy,
                 const ReducerType& reducer)
@@ -444,7 +441,10 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
        m_result_ptr_host_accessible(
            MemorySpaceAccess<Kokkos::HostSpace,
                              typename ReducerType::result_view_type::
                                  memory_space>::accessible) {}
                                  memory_space>::accessible),
        m_shared_memory_lock(m_policy.space()
                                 .impl_internal_space_instance()
                                 ->m_mutexSharedMemory) {}
};

template <class FunctorType, class... Traits>
@@ -482,6 +482,9 @@ class ParallelScanHIPBase {
  size_type* m_scratch_flags = nullptr;
  size_type m_final = false;
  int m_grid_x = 0;
  // Only let one ParallelReduce/Scan modify the shared memory. The
  // constructor acquires the mutex which is released in the destructor.
  std::unique_lock<std::mutex> m_shared_memory_lock;

 private:
  template <class TagType>
@@ -624,22 +627,7 @@ class ParallelScanHIPBase {
  }

  // Determine block size constrained by shared memory:
  inline unsigned local_block_size(const FunctorType& f) {
    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y
    //
    // TODO check best option

    unsigned n = Experimental::Impl::HIPTraits::WarpSize * 4;
    while (n && static_cast<unsigned>(m_policy.space()
                                          .impl_internal_space_instance()
                                          ->m_maxShmemPerBlock) <
                    hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                             WorkTag>(f, n)) {
      n >>= 1;
    }
    return n;
  }
  virtual inline unsigned local_block_size(const FunctorType& f) = 0;

  inline void impl_execute() {
    const index_type nwork = m_policy.end() - m_policy.begin();
@@ -649,7 +637,11 @@ class ParallelScanHIPBase {
      const int gridMaxComputeCapability_2x = 0x01fff;

      const int block_size = static_cast<int>(local_block_size(m_functor));
      KOKKOS_ASSERT(block_size > 0);
      if (block_size == 0) {
        Kokkos::Impl::throw_runtime_exception(
            std::string("Kokkos::Impl::ParallelScan< HIP > could not find a "
                        "valid execution configuration."));
      }

      const int grid_max =
          std::min(block_size * block_size, gridMaxComputeCapability_2x);
@@ -674,15 +666,16 @@ class ParallelScanHIPBase {
      const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2);

      m_final = false;
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
                                                    LaunchBounds>(
      // these ones are OK to be just the base because the specializations
      // do not modify the kernel at all
      using DriverType = ParallelScanHIPBase<FunctorType, Traits...>;
      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
          *this, grid, block, shmem,
          m_policy.space().impl_internal_space_instance(),
          false);  // copy to device and execute

      m_final = true;
      Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelScanHIPBase,
                                                    LaunchBounds>(
      Kokkos::Experimental::Impl::hip_parallel_launch<DriverType, LaunchBounds>(
          *this, grid, block, shmem,
          m_policy.space().impl_internal_space_instance(),
          false);  // copy to device and execute
@@ -690,13 +683,17 @@ class ParallelScanHIPBase {
  }

  ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy)
      : m_functor(arg_functor), m_policy(arg_policy) {}
      : m_functor(arg_functor),
        m_policy(arg_policy),
        m_shared_memory_lock(m_policy.space()
                                 .impl_internal_space_instance()
                                 ->m_mutexSharedMemory) {}
};

template <class FunctorType, class... Traits>
class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
                   Kokkos::Experimental::HIP>
    : private ParallelScanHIPBase<FunctorType, Traits...> {
    : public ParallelScanHIPBase<FunctorType, Traits...> {
 public:
  using Base = ParallelScanHIPBase<FunctorType, Traits...>;
  using Base::operator();
@@ -706,6 +703,23 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
  ParallelScan(const FunctorType& arg_functor,
               const typename Base::Policy& arg_policy)
      : Base(arg_functor, arg_policy) {}

  inline unsigned local_block_size(const FunctorType& f) {
    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y

    const auto& instance =
        Base::m_policy.space().impl_internal_space_instance();
    auto shmem_functor = [&f](unsigned n) {
      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                      typename Base::WorkTag>(
          f, n);
    };
    using DriverType = ParallelScan<FunctorType, typename Base::Policy,
                                    Kokkos::Experimental::HIP>;
    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
  }
};

//----------------------------------------------------------------------------
@@ -713,7 +727,7 @@ class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
template <class FunctorType, class ReturnType, class... Traits>
class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                            ReturnType, Kokkos::Experimental::HIP>
    : private ParallelScanHIPBase<FunctorType, Traits...> {
    : public ParallelScanHIPBase<FunctorType, Traits...> {
 public:
  using Base = ParallelScanHIPBase<FunctorType, Traits...>;
  using Base::operator();
@@ -737,6 +751,24 @@ class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
                        const typename Base::Policy& arg_policy,
                        ReturnType& arg_returnvalue)
      : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {}

  inline unsigned local_block_size(const FunctorType& f) {
    // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or
    // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y

    const auto& instance =
        Base::m_policy.space().impl_internal_space_instance();
    auto shmem_functor = [&f](unsigned n) {
      return hip_single_inter_block_reduce_scan_shmem<false, FunctorType,
                                                      typename Base::WorkTag>(
          f, n);
    };
    using DriverType =
        ParallelScanWithTotal<FunctorType, typename Base::Policy, ReturnType,
                              Kokkos::Experimental::HIP>;
    return Kokkos::Experimental::Impl::hip_get_preferred_blocksize<
        DriverType, typename Base::LaunchBounds>(instance, shmem_functor);
  }
};

}  // namespace Impl

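ParallelScanWithTotal is the driver behind the parallel_scan overload that also returns the final value. A hedged usage sketch (values, prefix, and n are assumed user-defined views and an extent, not names from this diff):

double total = 0.0;
Kokkos::parallel_scan(
    "prefix_sum", Kokkos::RangePolicy<Kokkos::Experimental::HIP>(0, n),
    KOKKOS_LAMBDA(const int i, double& update, const bool final) {
      update += values(i);            // accumulate running sum
      if (final) prefix(i) = update;  // write inclusive prefix on final pass
    },
    total);  // receives the grand total on the host
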
@@ -56,20 +56,20 @@

namespace Kokkos {
namespace Impl {

template <typename... Properties>
class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
    : public PolicyTraits<Properties...> {
 public:
  using execution_policy = TeamPolicyInternal;

  using traits = PolicyTraits<Properties...>;
  using traits    = PolicyTraits<Properties...>;
  using BlockType = Kokkos::Experimental::Impl::BlockType;

  template <typename ExecSpace, typename... OtherProperties>
  friend class TeamPolicyInternal;

 private:
  static int constexpr MAX_WARP = 8;

  typename traits::execution_space m_space;
  int m_league_size;
  int m_team_size;
@@ -101,17 +101,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
  template <typename FunctorType>
  int team_size_max(FunctorType const& f, ParallelForTag const&) const {
    using closure_type =
        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type,
        typename traits::launch_bounds>::get_hip_func_attributes();
    int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size<
        FunctorType, typename traits::launch_bounds>(
        space().impl_internal_space_instance(), attr, f,
        static_cast<size_t>(impl_vector_length()),
        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
    return block_size / impl_vector_length();
        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;

    return internal_team_size_common<BlockType::Max, closure_type>(f);
  }

  template <class FunctorType>
@@ -129,8 +121,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
    return internal_team_size_max<closure_type>(f);
  }

  template <class FunctorType, class ReducerType>
  inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/,
  template <typename FunctorType, typename ReducerType>
  inline int team_size_max(const FunctorType& f, const ReducerType&,
                           const ParallelReduceTag&) const {
    using closure_type =
        Impl::ParallelReduce<FunctorType, TeamPolicy<Properties...>,
@@ -141,17 +133,9 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
  template <typename FunctorType>
  int team_size_recommended(FunctorType const& f, ParallelForTag const&) const {
    using closure_type =
        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...> >;
    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type,
        typename traits::launch_bounds>::get_hip_func_attributes();
    int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
        FunctorType, typename traits::launch_bounds>(
        space().impl_internal_space_instance(), attr, f,
        static_cast<size_t>(impl_vector_length()),
        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double));
    return block_size / impl_vector_length();
        Impl::ParallelFor<FunctorType, TeamPolicy<Properties...>>;

    return internal_team_size_common<BlockType::Preferred, closure_type>(f);
  }

  template <typename FunctorType>
@@ -169,7 +153,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
    return internal_team_size_recommended<closure_type>(f);
  }

  template <class FunctorType, class ReducerType>
  template <typename FunctorType, typename ReducerType>
  int team_size_recommended(FunctorType const& f, ReducerType const&,
                            ParallelReduceTag const&) const {
    using closure_type =
@@ -177,6 +161,7 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
                             ReducerType>;
    return internal_team_size_recommended<closure_type>(f);
  }

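The team_size_max and team_size_recommended overloads refactored above are also the public query points; requesting a team size larger than the maximum triggers the "requested too large team size" exception seen later in this diff. Usage sketch (functor and league_size are illustrative assumptions):

Kokkos::TeamPolicy<Kokkos::Experimental::HIP> policy(league_size, Kokkos::AUTO);
const int tmax = policy.team_size_max(functor, Kokkos::ParallelForTag());
const int trec = policy.team_size_recommended(functor, Kokkos::ParallelForTag());
// Choose a team size no larger than tmax; trec is the preferred default.
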
  inline bool impl_auto_vector_length() const { return m_tune_vector_length; }
  inline bool impl_auto_team_size() const { return m_tune_team_size; }
  static int vector_length_max() {
@@ -211,7 +196,10 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
  inline void impl_set_vector_length(size_t size) { m_vector_length = size; }
  inline void impl_set_team_size(size_t size) { m_team_size = size; }
  int impl_vector_length() const { return m_vector_length; }

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
  KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); }
#endif

  int team_size() const { return m_team_size; }

@@ -266,7 +254,8 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
                   "space.");

    // Make sure total block size is permissible
    if (m_team_size * m_vector_length > 1024) {
    if (m_team_size * m_vector_length >
        ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock) {
      Impl::throw_runtime_exception(
          std::string("Kokkos::TeamPolicy< HIP > the team size is too large. "
                      "Team size x vector length must be smaller than 1024."));
@@ -363,26 +352,84 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>
  using member_type = Kokkos::Impl::HIPTeamMember;

 protected:
  template <class ClosureType, class FunctorType, class BlockSizeCallable>
  int internal_team_size_common(const FunctorType& f,
                                BlockSizeCallable&& block_size_callable) const {
    using closure_type = ClosureType;
  template <BlockType BlockSize, class ClosureType, class FunctorType>
  int internal_team_size_common(const FunctorType& f) const {
    // FIXME_HIP: this could be unified with the
    // internal_team_size_common_reduce
    // once we can turn c++17 constexpr on by default.
    // The problem right now is that we can't turn off the evaluation
    // of the functor_value_traits's valuesize / StaticValueSize

    const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double);
    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double);
    const int vector_length = impl_vector_length();

    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
                             const hipFuncAttributes& attr, int block_size) {
      int functor_shmem =
          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
              f, block_size / vector_length);
      return shmem_block + shmem_thread * (block_size / vector_length) +
             functor_shmem + attr.sharedSizeBytes;
    };
    int block_size;
    // FIXME_HIP - could be if constexpr for c++17
    if (BlockSize == BlockType::Max) {
      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
          ClosureType, typename traits::launch_bounds>(
          space().impl_internal_space_instance(), functor);
    } else {
      block_size =
          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
              ClosureType, typename traits::launch_bounds>(
              space().impl_internal_space_instance(), functor);
    }
    if (block_size == 0) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid "
                      "team size."));
    }
    return block_size / impl_vector_length();
  }

  template <BlockType BlockSize, class ClosureType, class FunctorType>
  int internal_team_size_common_reduce(const FunctorType& f) const {
    using functor_value_traits =
        Impl::FunctorValueTraits<FunctorType, typename traits::work_tag>;

    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        closure_type,
        typename traits::launch_bounds>::get_hip_func_attributes();
    const int block_size = std::forward<BlockSizeCallable>(block_size_callable)(
        space().impl_internal_space_instance(), attr, f,
        static_cast<size_t>(impl_vector_length()),
        static_cast<size_t>(team_scratch_size(0)) + 2 * sizeof(double),
        static_cast<size_t>(thread_scratch_size(0)) + sizeof(double) +
            ((functor_value_traits::StaticValueSize != 0)
                 ? 0
                 : functor_value_traits::value_size(f)));
    KOKKOS_ASSERT(block_size > 0);
    const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double);
    const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double) +
                                  ((functor_value_traits::StaticValueSize != 0)
                                       ? 0
                                       : functor_value_traits::value_size(f));
    const int vector_length = impl_vector_length();

    const auto functor = [&f, shmem_block, shmem_thread, vector_length](
                             const hipFuncAttributes& attr, int block_size) {
      int functor_shmem =
          ::Kokkos::Impl::FunctorTeamShmemSize<FunctorType>::value(
              f, block_size / vector_length);
      return shmem_block + shmem_thread * (block_size / vector_length) +
             functor_shmem + attr.sharedSizeBytes;
    };
    int block_size;
    // FIXME_HIP - could be if constexpr for c++17
    if (BlockSize == BlockType::Max) {
      block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize<
          ClosureType, typename traits::launch_bounds>(
          space().impl_internal_space_instance(), functor);
    } else {
      block_size =
          ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize<
              ClosureType, typename traits::launch_bounds>(
              space().impl_internal_space_instance(), functor);
    }

    if (block_size == 0) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a "
                      "valid team size."));
    }
    // Currently we require Power-of-2 team size for reductions.
    int p2 = 1;
    while (p2 <= block_size) p2 *= 2;
@@ -392,16 +439,13 @@ class TeamPolicyInternal<Kokkos::Experimental::HIP, Properties...>

  template <class ClosureType, class FunctorType>
  int internal_team_size_max(const FunctorType& f) const {
    return internal_team_size_common<ClosureType>(
        f, ::Kokkos::Experimental::Impl::hip_get_max_block_size<
               FunctorType, typename traits::launch_bounds>);
    return internal_team_size_common_reduce<BlockType::Max, ClosureType>(f);
  }

  template <class ClosureType, class FunctorType>
  int internal_team_size_recommended(const FunctorType& f) const {
    return internal_team_size_common<ClosureType>(
        f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
               FunctorType, typename traits::launch_bounds>);
    return internal_team_size_common_reduce<BlockType::Preferred, ClosureType>(
        f);
  }
};

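The reduce variant ends by rounding the block size down to a power of two, since the tree reduction requires it. The loop above overshoots to the first power of two strictly greater than block_size, so the usable value is that result halved; a standalone sketch of the step:

// Sketch of the rounding step used for reduction team sizes.
int round_down_to_pow2(int block_size) {
  int p2 = 1;
  while (p2 <= block_size) p2 *= 2;  // first power of two > block_size
  return p2 / 2;                     // e.g. block_size 3000 -> 2048
}
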
@@ -505,7 +549,11 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
    dim3 const block(static_cast<int>(m_vector_size),
                     static_cast<int>(m_team_size), 1);

    ::Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelFor, launch_bounds>(
    using closure_type =
        ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
                    Kokkos::Experimental::HIP>;
    ::Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
                                                      launch_bounds>(
        *this, grid, block, shmem_size_total,
        m_policy.space().impl_internal_space_instance(),
        true);  // copy to device and execute
@@ -520,17 +568,9 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch<
        ParallelFor, launch_bounds>::get_hip_func_attributes();
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : ::Kokkos::Experimental::Impl::hip_get_opt_block_size<
                  FunctorType, launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;
    m_team_size = m_team_size >= 0 ? m_team_size
                                   : arg_policy.team_size_recommended(
                                         arg_functor, ParallelForTag());

    m_shmem_begin = (sizeof(double) * (m_team_size + 2));
    m_shmem_size =
@@ -556,23 +596,12 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
    int const shmem_size_total = m_shmem_begin + m_shmem_size;
    if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
        shmem_size_total) {
      printf(
          "%i %i\n",
          m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock,
          shmem_size_total);
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory"));
    }

    if (static_cast<int>(m_team_size) >
        static_cast<int>(
            ::Kokkos::Experimental::Impl::hip_get_max_block_size<FunctorType,
                                                                 launch_bounds>(
                m_policy.space().impl_internal_space_instance(), attr,
                arg_functor, arg_policy.impl_vector_length(),
                arg_policy.team_scratch_size(0),
                arg_policy.thread_scratch_size(0)) /
            arg_policy.impl_vector_length())) {
    size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag());
    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
      Kokkos::Impl::throw_runtime_exception(std::string(
          "Kokkos::Impl::ParallelFor< HIP > requested too large team size."));
    }
@@ -839,8 +868,11 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
    }
    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

    Kokkos::Experimental::Impl::HIPParallelLaunch<ParallelReduce,
                                                  launch_bounds>(
    using closure_type =
        ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                       ReducerType, Kokkos::Experimental::HIP>;
    Kokkos::Experimental::Impl::hip_parallel_launch<closure_type,
                                                    launch_bounds>(
        *this, grid, block, shmem_size_total,
        m_policy.space().impl_internal_space_instance(),
        true);  // copy to device and execute
@@ -890,17 +922,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        ParallelReduce, launch_bounds>::get_hip_func_attributes();
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
                                                                 launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;
    m_team_size = m_team_size >= 0 ? m_team_size
                                   : arg_policy.team_size_recommended(
                                         arg_functor, ParallelReduceTag());

    m_team_begin =
        UseShflReduction
@@ -958,8 +982,9 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
                      "L0 scratch memory"));
    }

    if (static_cast<int>(m_team_size) >
        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {
    size_t max_size =
        arg_policy.team_size_max(arg_functor, ParallelReduceTag());
    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                      "large team size."));
@@ -992,18 +1017,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
        m_scratch_lock(m_policy.space()
                           .impl_internal_space_instance()
                           ->m_team_scratch_mutex) {
    hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch<
        ParallelReduce, launch_bounds>::get_hip_func_attributes();
    m_team_size =
        m_team_size >= 0
            ? m_team_size
            : Kokkos::Experimental::Impl::hip_get_opt_block_size<FunctorType,
                                                                 launch_bounds>(
                  m_policy.space().impl_internal_space_instance(), attr,
                  m_functor, m_vector_size, m_policy.team_scratch_size(0),
                  m_policy.thread_scratch_size(0)) /
                  m_vector_size;

    m_team_size = m_team_size >= 0
                      ? m_team_size
                      : arg_policy.team_size_recommended(arg_functor, reducer,
                                                         ParallelReduceTag());
    m_team_begin =
        UseShflReduction
            ? 0
@@ -1046,7 +1063,6 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
    // upon team size.

    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size;

    if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) &&
         !UseShflReduction) ||
        m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock <
@@ -1054,8 +1070,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size"));
    }
    if (static_cast<int>(m_team_size) >
        arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) {

    size_t max_size =
        arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag());
    if (static_cast<int>(m_team_size) > static_cast<int>(max_size)) {
      Kokkos::Impl::throw_runtime_exception(
          std::string("Kokkos::Impl::ParallelReduce< HIP > requested too "
                      "large team size."));

@@ -67,102 +67,32 @@ namespace {
hipStream_t get_deep_copy_stream() {
  static hipStream_t s = nullptr;
  if (s == nullptr) {
    HIP_SAFE_CALL(hipStreamCreate(&s));
    KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&s));
  }
  return s;
}
}  // namespace

DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
void DeepCopyHIP(void* dst, void const* src, size_t n) {
  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
}

DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace,
         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
}

DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace,
         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
}

DeepCopy<Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace,
         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
                                                  instance,
                                              void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(
      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
}

DeepCopy<HostSpace, Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIP>::
    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
             const void* src, size_t n) {
  HIP_SAFE_CALL(
      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
}

DeepCopy<Kokkos::Experimental::HIPSpace, HostSpace, Kokkos::Experimental::HIP>::
    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
             const void* src, size_t n) {
  HIP_SAFE_CALL(
      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
}

DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
         Kokkos::Experimental::HIPHostPinnedSpace,
         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
}

DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
}

DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
         Kokkos::Experimental::HIP>::DeepCopy(void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault));
}

DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace,
         Kokkos::Experimental::HIPHostPinnedSpace, Kokkos::Experimental::HIP>::
    DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst,
             const void* src, size_t n) {
  HIP_SAFE_CALL(
      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
}

DeepCopy<HostSpace, Kokkos::Experimental::HIPHostPinnedSpace,
         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
                                                  instance,
                                              void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(
      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
}

DeepCopy<Kokkos::Experimental::HIPHostPinnedSpace, HostSpace,
         Kokkos::Experimental::HIP>::DeepCopy(const Kokkos::Experimental::HIP&
                                                  instance,
                                              void* dst, const void* src,
                                              size_t n) {
  HIP_SAFE_CALL(
void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst,
                      void const* src, size_t n) {
  KOKKOS_IMPL_HIP_SAFE_CALL(
      hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream()));
}

void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
  hipStream_t s = get_deep_copy_stream();
  HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
  HIP_SAFE_CALL(hipStreamSynchronize(s));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s));
  Kokkos::Tools::Experimental::Impl::profile_fence_event<
      Kokkos::Experimental::HIP>(
      "Kokkos::Impl::DeepCopyAsyncHIP: Post Deep Copy Fence on Deep-Copy "
      "stream",
      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
          DeepCopyResourceSynchronization,
      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(s)); });
}

}  // namespace Impl
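The many per-space DeepCopy specializations collapse into DeepCopyHIP and DeepCopyAsyncHIP, which back the public deep_copy API. Usage sketch (dst_view and src_view are assumed layout-compatible views):

Kokkos::Experimental::HIP space;
Kokkos::deep_copy(space, dst_view, src_view);  // stream-ordered, asynchronous
space.fence("wait for deep_copy");             // synchronize before host reads
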
@@ -171,6 +101,7 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) {
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/

#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
namespace Kokkos {

KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() {
@@ -188,6 +119,7 @@ KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) {
}

}  // namespace Kokkos
#endif

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
@@ -283,7 +215,7 @@ void HIPSpace::impl_deallocate(
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }
  HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr));
}

void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr,
@@ -307,7 +239,7 @@ void HIPHostPinnedSpace::impl_deallocate(
    Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr,
                                      reported_size);
  }
  HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
  KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr));
}

}  // namespace Experimental
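HIP_SAFE_CALL becomes the namespaced KOKKOS_IMPL_HIP_SAFE_CALL throughout. For readers unfamiliar with the pattern, such macros wrap every runtime call so a failure surfaces immediately with file/line context; an illustrative stand-in, not the library's actual definition:

#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

#define MY_HIP_SAFE_CALL(call)                                  \
  do {                                                          \
    hipError_t err_ = (call);                                   \
    if (err_ != hipSuccess) {                                   \
      std::fprintf(stderr, "HIP error %s at %s:%d\n",           \
                   hipGetErrorName(err_), __FILE__, __LINE__);  \
      std::abort();                                             \
    }                                                           \
  } while (0)
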
@@ -427,23 +359,42 @@ HIP::HIP()
      "HIP instance constructor");
}

HIP::HIP(hipStream_t const stream)
HIP::HIP(hipStream_t const stream, bool manage_stream)
    : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) {
        ptr->finalize();
        delete ptr;
      }) {
  Impl::HIPInternal::singleton().verify_is_initialized(
      "HIP instance constructor");
  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream);
  m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream,
                               manage_stream);
}

void HIP::print_configuration(std::ostream& s, const bool) {
  Impl::HIPInternal::singleton().print_configuration(s);
}

void HIP::impl_static_fence() { HIP_SAFE_CALL(hipDeviceSynchronize()); }
uint32_t HIP::impl_instance_id() const noexcept {
  return m_space_instance->impl_get_instance_id();
}
void HIP::impl_static_fence(const std::string& name) {
  Kokkos::Tools::Experimental::Impl::profile_fence_event<
      Kokkos::Experimental::HIP>(
      name,
      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
          GlobalDeviceSynchronization,
      [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); });
}
void HIP::impl_static_fence() {
  impl_static_fence("Kokkos::HIP::impl_static_fence: Unnamed Static Fence");
}

void HIP::fence() const { m_space_instance->fence(); }
void HIP::fence(const std::string& name) const {
  m_space_instance->fence(name);
}
void HIP::fence() const {
  fence("Kokkos::HIP::fence(): Unnamed Instance Fence");
}

hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; }

@@ -489,6 +440,9 @@ void HIPSpaceInitializer::finalize(const bool all_spaces) {
void HIPSpaceInitializer::fence() {
  Kokkos::Experimental::HIP::impl_static_fence();
}
void HIPSpaceInitializer::fence(const std::string& name) {
  Kokkos::Experimental::HIP::impl_static_fence(name);
}

void HIPSpaceInitializer::print_configuration(std::ostream& msg,
                                              const bool detail) {

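The constructor gains a manage_stream flag: when true, the instance owns the wrapped stream and destroys it at the end of the instance's lifetime. Usage sketch (n and the lambda body are placeholders):

hipStream_t stream;
KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
// manage_stream = true: Kokkos destroys the stream with the instance
Kokkos::Experimental::HIP space(stream, /*manage_stream=*/true);
Kokkos::parallel_for(
    Kokkos::RangePolicy<Kokkos::Experimental::HIP>(space, 0, n),
    KOKKOS_LAMBDA(const int i) { /* work issued on space's stream */ });
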
@@ -316,198 +316,6 @@ class HIPTeamMember {
#endif
  }

  //--------------------------------------------------------------------------
  /**\brief Global reduction across all blocks
   *
   * Return !0 if reducer contains the final value
   */
  template <typename ReducerType>
  KOKKOS_INLINE_FUNCTION static
      typename std::enable_if<is_reducer<ReducerType>::value, int>::type
      global_reduce(ReducerType const& reducer, int* const global_scratch_flags,
                    void* const global_scratch_space, void* const shmem,
                    int const shmem_size) {
#ifdef __HIP_DEVICE_COMPILE__
    using value_type = typename ReducerType::value_type;
    using pointer_type = value_type volatile*;

    // Number of shared memory entries for the reduction:
    const int nsh = shmem_size / sizeof(value_type);

    // Number of HIP threads in the block, rank within the block
    const int nid = blockDim.x * blockDim.y * blockDim.z;
    const int tid =
        threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z);

    // Reduces within block using all available shared memory
    // Contributes if it is the root "vector lane"

    // wn == number of warps in the block
    // wx == which lane within the warp
    // wy == which warp within the block

    const int wn = (nid + Experimental::Impl::HIPTraits::WarpIndexMask) >>
                   Experimental::Impl::HIPTraits::WarpIndexShift;
    const int wx = tid & Experimental::Impl::HIPTraits::WarpIndexMask;
    const int wy = tid >> Experimental::Impl::HIPTraits::WarpIndexShift;

    //------------------------
    {  // Intra warp shuffle reduction from contributing HIP threads

      value_type tmp(reducer.reference());

      int constexpr warp_size =
          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
      for (int i = warp_size; static_cast<int>(blockDim.x) <= (i >>= 1);) {
        Experimental::Impl::in_place_shfl_down(reducer.reference(), tmp, i,
                                               warp_size);

        // Root of each vector lane reduces "thread" contribution
        if (0 == threadIdx.x && wx < i) {
          reducer.join(&tmp, reducer.data());
        }
      }

      // Reduce across warps using shared memory.
      // Number of warps may not be power of two.

      __syncthreads();  // Wait before shared data write

      // Number of shared memory entries for the reduction
      // is at most one per warp
      const int nentry = wn < nsh ? wn : nsh;

      if (0 == wx && wy < nentry) {
        // Root thread of warp 'wy' has warp's value to contribute
        (reinterpret_cast<value_type*>(shmem))[wy] = tmp;
      }

      __syncthreads();  // Wait for write to be visible to block

      // When more warps than shared entries
      // then warps must take turns joining their contribution
      // to the designated shared memory entry.
      for (int i = nentry; i < wn; i += nentry) {
        const int k = wy - i;

        if (0 == wx && i <= wy && k < nentry) {
          // Root thread of warp 'wy' has warp's value to contribute
          reducer.join((reinterpret_cast<value_type*>(shmem)) + k, &tmp);
        }

        __syncthreads();  // Wait for write to be visible to block
      }

      // One warp performs the inter-warp reduction:

      if (0 == wy) {
        // Start fan-in at power of two covering nentry

        for (int i = (1 << (warp_size - __clz(nentry - 1))); (i >>= 1);) {
          const int k = wx + i;
          if (wx < i && k < nentry) {
            reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
                         (reinterpret_cast<pointer_type>(shmem)) + k);
            __threadfence_block();  // Wait for write to be visible to warp
          }
        }
      }
    }
    //------------------------
    {  // Write block's value to global_scratch_memory

      int last_block = 0;

      if (0 == wx) {
        reducer.copy((reinterpret_cast<pointer_type>(global_scratch_space)) +
                         blockIdx.x * reducer.length(),
                     reducer.data());

        __threadfence();  // Wait until global write is visible.

        last_block = static_cast<int>(gridDim.x) ==
                     1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1);

        // If last block then reset count
        if (last_block) *global_scratch_flags = 0;
      }

      // FIXME hip does not support __syncthreads_or so we need to do it by hand
      // last_block = __syncthreads_or(last_block);

      __shared__ int last_block_shared;
      if (last_block) last_block_shared = last_block;
      __threadfence_block();

      if (!last_block_shared) return 0;
    }
    //------------------------
    // Last block reads global_scratch_memory into shared memory.

    const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh)
                                       : (gridDim.x < nsh ? gridDim.x : nsh);

    // nentry = min( nid , nsh , gridDim.x )

    // whole block reads global memory into shared memory:

    if (tid < nentry) {
      const int offset = tid * reducer.length();

      reducer.copy(
          (reinterpret_cast<pointer_type>(shmem)) + offset,
          (reinterpret_cast<pointer_type>(global_scratch_space)) + offset);

      for (int i = nentry + tid; i < static_cast<int>(gridDim.x); i += nentry) {
        reducer.join((reinterpret_cast<pointer_type>(shmem)) + offset,
                     (reinterpret_cast<pointer_type>(global_scratch_space)) +
                         i * reducer.length());
      }
    }

    __syncthreads();  // Wait for writes to be visible to block

    if (0 == wy) {
      // Iterate to reduce shared memory to single warp fan-in size

      int constexpr warp_size =
          ::Kokkos::Experimental::Impl::HIPTraits::WarpSize;
      const int nreduce = warp_size < nentry ? warp_size : nentry;

      if (wx < nreduce && nreduce < nentry) {
        for (int i = nreduce + wx; i < nentry; i += nreduce) {
          reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i);
        }
        __threadfence_block();  // Wait for writes to be visible to warp
      }

      // Start fan-in at power of two covering nentry

      for (int i = (1 << (warp_size - __clz(nreduce - 1))); (i >>= 1);) {
        const int k = wx + i;
        if (wx < i && k < nreduce) {
          reducer.join((reinterpret_cast<pointer_type>(shmem)) + wx,
                       (reinterpret_cast<pointer_type>(shmem)) + k);
          __threadfence_block();  // Wait for writes to be visible to warp
        }
      }

      if (0 == wx) {
        reducer.copy(reducer.data(), reinterpret_cast<pointer_type>(shmem));
        return 1;
      }
    }
    return 0;
#else
    (void)reducer;
    (void)global_scratch_flags;
    (void)global_scratch_space;
    (void)shmem;
    (void)shmem_size;
    return 0;
#endif
  }

  //----------------------------------------
  // Private for the driver

@@ -191,6 +191,9 @@ void HPXSpaceInitializer::finalize(const bool all_spaces) {
}

void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); }
void HPXSpaceInitializer::fence(const std::string &name) {
  Kokkos::Experimental::HPX().fence(name);
}

void HPXSpaceInitializer::print_configuration(std::ostream &msg,
                                              const bool detail) {

Some files were not shown because too many files have changed in this diff.