diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index fe6c17801e..25211268e9 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -39,8 +39,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.4.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "4c84698917c93a18985b311bb6caf84f" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.5.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "079323d973ae0e1c38c0a54a150c674e" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) ExternalProject_Add(kokkos_build @@ -60,7 +60,7 @@ if(DOWNLOAD_KOKKOS) target_link_libraries(lmp PRIVATE LAMMPS::KOKKOS) add_dependencies(LAMMPS::KOKKOS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 3.4.01 REQUIRED CONFIG) + find_package(Kokkos 3.5.00 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) target_link_libraries(lmp PRIVATE Kokkos::kokkos) else() diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 2157fe86c8..994fdf2e5c 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -560,11 +560,26 @@ They must be specified in uppercase. * - VEGA908 - GPU - AMD GPU MI100 GFX908 - * - INTEL_GEN + * - VEGA90A - GPU - - Intel GPUs Gen9+ + - AMD GPU + * - INTEL_DG1 + - GPU + - Intel Iris XeMAX GPU + * - INTEL_GEN9 + - GPU + - Intel GPU Gen9 + * - INTEL_GEN11 + - GPU + - Intel GPU Gen11 + * - INTEL_GEN12LP + - GPU + - Intel GPU Gen12LP + * - INTEL_XEHP + - GPU + - Intel GPUs Xe-HP -This list was last updated for version 3.4.1 of the Kokkos library. +This list was last updated for version 3.5.0 of the Kokkos library. .. 
tabs:: diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 7bb6de4cd9..2e779791dd 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,165 @@ # Change Log +## [3.5.00](https://github.com/kokkos/kokkos/tree/3.5.00) (2021-10-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.01...3.5.00) + +### Features: + +- Add support for quad-precision math functions/traits [\#4098](https://github.com/kokkos/kokkos/pull/4098) +- Adding ExecutionSpace partitioning function [\#4096](https://github.com/kokkos/kokkos/pull/4096) +- Improve Python Interop Capabilities [\#4065](https://github.com/kokkos/kokkos/pull/4065) +- Add half_t Kokkos::rand specialization [\#3922](https://github.com/kokkos/kokkos/pull/3922) +- Add math special functions: erf, erfcx, expint1, Bessel functions, Hankel functions [\#3920](https://github.com/kokkos/kokkos/pull/3920) +- Add missing common mathematical functions [\#4043](https://github.com/kokkos/kokkos/pull/4043) [\#4036](https://github.com/kokkos/kokkos/pull/4036) [\#4034](https://github.com/kokkos/kokkos/pull/4034) +- Let the numeric traits be SFINAE-friendly [\#4038](https://github.com/kokkos/kokkos/pull/4038) +- Add Desul atomics - enabling memory-order and memory-scope parameters [\#3247](https://github.com/kokkos/kokkos/pull/3247) +- Add detection idiom from the C++ standard library extension version 2 [\#3980](https://github.com/kokkos/kokkos/pull/3980) +- Fence Profiling Support in all backends [\#3966](https://github.com/kokkos/kokkos/pull/3966) [\#4304](https://github.com/kokkos/kokkos/pull/4304) [\#4258](https://github.com/kokkos/kokkos/pull/4258) [\#4232](https://github.com/kokkos/kokkos/pull/4232) +- Significant SYCL enhancements (see below) + +### Deprecations: + +- Deprecate CUDA_SAFE_CALL and HIP_SAFE_CALL [\#4249](https://github.com/kokkos/kokkos/pull/4249) +- Deprecate Kokkos::Impl::Timer (Kokkos::Timer has been available for a long time) [\#4201](https://github.com/kokkos/kokkos/pull/4201) +- Deprecate Experimental::MasterLock [\#4094](https://github.com/kokkos/kokkos/pull/4094) +- Deprecate Kokkos_TaskPolicy.hpp (headers got reorganized, doesn't remove functionality) [\#4011](https://github.com/kokkos/kokkos/pull/4011) +- Deprecate backward compatibility features [\#3978](https://github.com/kokkos/kokkos/pull/3978) +- Update and deprecate is_space::host_memory/execution/mirror_space [\#3973](https://github.com/kokkos/kokkos/pull/3973) + + +### Backends and Archs Enhancements: + +- Enabling constbitset constructors in kernels [\#4296](https://github.com/kokkos/kokkos/pull/4296) +- Use ZeroMemset in View constructor to improve performance [\#4226](https://github.com/kokkos/kokkos/pull/4226) +- Use memset in deep_copy [\#3944](https://github.com/kokkos/kokkos/pull/3944) +- Add missing fence() calls in resize(View) that effectively do deep_copy(resized, orig) [\#4212](https://github.com/kokkos/kokkos/pull/4212) +- Avoid allocations in resize and realloc [\#4207](https://github.com/kokkos/kokkos/pull/4207) +- StaticCsrGraph: use device type instead of execution space to construct views [\#3991](https://github.com/kokkos/kokkos/pull/3991) +- Consider std::sort when view is accessible from host [\#3929](https://github.com/kokkos/kokkos/pull/3929) +- Fix CPP20 warnings except for volatile [\#4312](https://github.com/kokkos/kokkos/pull/4312) + +#### SYCL: +- Introduce SYCLHostUSMSpace [\#4268](https://github.com/kokkos/kokkos/pull/4268) +- Implement SYCL TeamPolicy for vector_size > 1 
[\#4183](https://github.com/kokkos/kokkos/pull/4183) +- Enable 64bit ranges for SYCL [\#4211](https://github.com/kokkos/kokkos/pull/4211) +- Don't print SYCL device info in execution space initialization [\#4168](https://github.com/kokkos/kokkos/pull/4168) +- Improve SYCL MDRangePolicy performance [\#4161](https://github.com/kokkos/kokkos/pull/4161) +- Use sub_groups in SYCL parallel_scan [\#4147](https://github.com/kokkos/kokkos/pull/4147) +- Implement subgroup reduction for SYCL RangePolicy parallel_reduce [\#3940](https://github.com/kokkos/kokkos/pull/3940) +- Use DPC++ broadcast extension in SYCL team_broadcast [\#4103](https://github.com/kokkos/kokkos/pull/4103) +- Only fence in SYCL parallel_reduce for non-device-accessible result_ptr [\#4089](https://github.com/kokkos/kokkos/pull/4089) +- Improve fencing behavior in SYCL backend [\#4088](https://github.com/kokkos/kokkos/pull/4088) +- Fence all registered SYCL queues before deallocating memory [\#4086](https://github.com/kokkos/kokkos/pull/4086) +- Implement SYCL::print_configuration [\#3992](https://github.com/kokkos/kokkos/pull/3992) +- Reuse scratch memory in parallel_scan and TeamPolicy (decreases memory footprint) [\#3899](https://github.com/kokkos/kokkos/pull/3899) [\#3889](https://github.com/kokkos/kokkos/pull/3889) + +#### CUDA: +- Improve heuristic for CUDA blocksize [\#4271](https://github.com/kokkos/kokkos/pull/4271) +- Don't use [[deprecated]] for nvcc [\#4229](https://github.com/kokkos/kokkos/pull/4229) +- Improve error message for NVHPC as host compiler [\#4227](https://github.com/kokkos/kokkos/pull/4227) +- Update support for CUDA reductions to work with types < 4 bytes [\#4156](https://github.com/kokkos/kokkos/pull/4156) +- Fix incompatible team size deduction in rare parallel_reduce cases [\#4142](https://github.com/kokkos/kokkos/pull/4142) +- Remove UVM usage in DynamicView [\#4129](https://github.com/kokkos/kokkos/pull/4129) +- Remove dependency between core and containers [\#4114](https://github.com/kokkos/kokkos/pull/4114) +- Add opt-in CudaMallocSync support when using CUDA version >= 11.2 [\#4026](https://github.com/kokkos/kokkos/pull/4026) [\#4233](https://github.com/kokkos/kokkos/pull/4233) +- Fix a potential race condition in the CUDA backend [\#3999](https://github.com/kokkos/kokkos/pull/3999) + +#### HIP: +- Implement new blocksize deduction method for HIP backend [\#3953](https://github.com/kokkos/kokkos/pull/3953) +- Add multiple LaunchMechanism [\#3820](https://github.com/kokkos/kokkos/pull/3820) +- Make HIP backend thread-safe [\#4170](https://github.com/kokkos/kokkos/pull/4170) + +#### Serial: +- Refactor Serial backend and fix thread-safety issue [\#4053](https://github.com/kokkos/kokkos/pull/4053) + +#### OpenMPTarget: +- OpenMPTarget: support array reductions in RangePolicy [\#4040](https://github.com/kokkos/kokkos/pull/4040) +- OpenMPTarget: add MDRange parallel_reduce [\#4032](https://github.com/kokkos/kokkos/pull/4032) +- OpenMPTarget: fix bug for the case of a reducer. 
[\#4044](https://github.com/kokkos/kokkos/pull/4044) +- OpenMPTarget: verify process fix [\#4041](https://github.com/kokkos/kokkos/pull/4041) + +### Build System Enhancements: + +#### Important Build System Updates: +- Use hipcc architecture autodetection when Kokkos_ARCH is not set [\#3941](https://github.com/kokkos/kokkos/pull/3941) +- Introduce Kokkos_ENABLE_DEPRECATION_WARNINGS and remove deprecated code with Kokkos_ENABLE_DEPRECATED_CODE_3 [\#4106](https://github.com/kokkos/kokkos/pull/4106) [\#3855](https://github.com/kokkos/kokkos/pull/3855) + +#### Other Improvements: +- Add allow-unsupported-compiler flag to nvcc-wrapper [\#4298](https://github.com/kokkos/kokkos/pull/4298) +- nvcc_wrapper: fix errors in argument handling [\#3993](https://github.com/kokkos/kokkos/pull/3993) +- Add support for -time= and -time in nvcc_wrapper [\#4015](https://github.com/kokkos/kokkos/pull/4015) +- nvcc_wrapper: suppress duplicates of GPU architecture and RDC flags [\#3968](https://github.com/kokkos/kokkos/pull/3968) +- Fix TMPDIR support in nvcc_wrapper [\#3792](https://github.com/kokkos/kokkos/pull/3792) +- NVHPC: update PGI compiler arch flags [\#4133](https://github.com/kokkos/kokkos/pull/4133) +- Replace PGI with NVHPC (works for both) [\#4196](https://github.com/kokkos/kokkos/pull/4196) +- Make sure that KOKKOS_CXX_HOST_COMPILER_ID is defined [\#4235](https://github.com/kokkos/kokkos/pull/4235) +- Add options to Makefile builds for deprecated code and warnings [\#4215](https://github.com/kokkos/kokkos/pull/4215) +- Use KOKKOS_CXX_HOST_COMPILER_ID for identifying CPU arch flags [\#4199](https://github.com/kokkos/kokkos/pull/4199) +- Add support for Cray Clang to Makefile.kokkos [\#4176](https://github.com/kokkos/kokkos/pull/4176) +- Add XLClang as compiler [\#4120](https://github.com/kokkos/kokkos/pull/4120) +- Keep quoted compiler flags when passing to Trilinos [\#3987](https://github.com/kokkos/kokkos/pull/3987) +- Add support for AMD Zen3 CPU architecture [\#3972](https://github.com/kokkos/kokkos/pull/3972) +- Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945) +- Add cppcoreguidelines-pro-type-cstyle-cast to clang-tidy [\#3522](https://github.com/kokkos/kokkos/pull/3522) +- Add sve bit size definition for A64FX [\#3947](https://github.com/kokkos/kokkos/pull/3947) [\#3946](https://github.com/kokkos/kokkos/pull/3946) +- Remove KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES [\#4150](https://github.com/kokkos/kokkos/pull/4150) + +### Other Changes: + +#### Tool Enhancements: + +- Retrieve original value from a point in a MultidimensionalSparseTuningProblem [\#3977](https://github.com/kokkos/kokkos/pull/3977) +- Allow extension of built-in tuners with additional tuning axes [\#3961](https://github.com/kokkos/kokkos/pull/3961) +- Add a categorical tuner [\#3955](https://github.com/kokkos/kokkos/pull/3955) + + +#### Miscellaneous: + +- hpcbind: use double quotes around $@ when invoking user command [\#4284](https://github.com/kokkos/kokkos/pull/4284) +- Add file and line to error message [\#3985](https://github.com/kokkos/kokkos/pull/3985) +- Fix compiler warnings when compiling with nvc++ [\#4198](https://github.com/kokkos/kokkos/pull/4198) +- Add OpenMPTarget CI build on AMD GPUs [\#4055](https://github.com/kokkos/kokkos/pull/4055) +- CI: icpx is now part of the Intel container [\#4002](https://github.com/kokkos/kokkos/pull/4002) + +### Incompatibilities: + +- Remove pre-CUDA 9 KOKKOS_IMPL_CUDA_* macros [\#4138](https://github.com/kokkos/kokkos/pull/4138) + +### 
Bug Fixes: +- UnorderedMap::clear() should zero the size() [\#4130](https://github.com/kokkos/kokkos/pull/4130) +- Add memory fence for HostSharedPtr::cleanup() [\#4144](https://github.com/kokkos/kokkos/pull/4144) +- SYCL: Fix race conditions in TeamPolicy::parallel_reduce [\#4418](https://github.com/kokkos/kokkos/pull/4418) +- Adding missing memory fence to serial exec space fence. [\#4292](https://github.com/kokkos/kokkos/pull/4292) +- Fix using external SYCL queues in tests [\#4291](https://github.com/kokkos/kokkos/pull/4291) +- Fix digits10 bug [\#4281](https://github.com/kokkos/kokkos/pull/4281) +- Fixes constexpr errors with frounding-math on gcc < 10. [\#4278](https://github.com/kokkos/kokkos/pull/4278) +- Fix compiler flags for PGI/NVHPC [\#4264](https://github.com/kokkos/kokkos/pull/4264) +- Fix Zen2/3 also implying Zen Arch with Makefiles [\#4260](https://github.com/kokkos/kokkos/pull/4260) +- Kokkos_Cuda.hpp: Fix shadow warning with cuda/11.0 [\#4252](https://github.com/kokkos/kokkos/pull/4252) +- Fix issue w/ static initialization of function attributes [\#4242](https://github.com/kokkos/kokkos/pull/4242) +- Disable long double hypot test on Power systems [\#4221](https://github.com/kokkos/kokkos/pull/4221) +- Fix false sharing in random pool [\#4218](https://github.com/kokkos/kokkos/pull/4218) +- Fix a missing memory_fence for debug shared alloc code [\#4216](https://github.com/kokkos/kokkos/pull/4216) +- Fix two xl issues [\#4179](https://github.com/kokkos/kokkos/pull/4179) +- Makefile.kokkos: fix (standard_in) 1: syntax error [\#4173](https://github.com/kokkos/kokkos/pull/4173) +- Fixes for query_device example [\#4172](https://github.com/kokkos/kokkos/pull/4172) +- Fix a bug when using HIP atomic with Kokkos::Complex [\#4159](https://github.com/kokkos/kokkos/pull/4159) +- Fix mistaken logic in pthread creation [\#4157](https://github.com/kokkos/kokkos/pull/4157) +- Define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION when requesting Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON [\#4107](https://github.com/kokkos/kokkos/pull/4107) +- Fix compilation with latest MSVC version [\#4102](https://github.com/kokkos/kokkos/pull/4102) +- Fix incorrect macro definitions when compiling with Intel compiler on Windows [\#4087](https://github.com/kokkos/kokkos/pull/4087) +- Fixup global buffer overflow in hand rolled string manipulation [\#4070](https://github.com/kokkos/kokkos/pull/4070) +- Fixup heap buffer overflow in cmd line args parsing unit tests [\#4069](https://github.com/kokkos/kokkos/pull/4069) +- Only add quotes in compiler flags for Trilinos if necessary [\#4067](https://github.com/kokkos/kokkos/pull/4067) +- Fixed invocation of tools init callbacks [\#4061](https://github.com/kokkos/kokkos/pull/4061) +- Work around SYCL JIT compiler issues with static variables [\#4013](https://github.com/kokkos/kokkos/pull/4013) +- Fix TestDetectionIdiom.cpp test inclusion for Trilinos/TriBITS [\#4010](https://github.com/kokkos/kokkos/pull/4010) +- Fixup allocation headers with OpenMPTarget backend [\#4003](https://github.com/kokkos/kokkos/pull/4003) +- Add missing specialization for OMPT to Kokkos Random [\#3967](https://github.com/kokkos/kokkos/pull/3967) +- Disable hypot long double test on power arches [\#3962](https://github.com/kokkos/kokkos/pull/3962) +- Use different EBO workaround for MSVC (rebased) [\#3924](https://github.com/kokkos/kokkos/pull/3924) +- Fix SYCL Kokkos::Profiling::(de)allocateData calls [\#3928](https://github.com/kokkos/kokkos/pull/3928) + ## 
[3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 9452027d8e..1b6753f983 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -111,8 +111,8 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) -set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 01) +set(Kokkos_VERSION_MINOR 5) +set(Kokkos_VERSION_PATCH 00) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -210,7 +210,12 @@ IF (KOKKOS_HAS_TRILINOS) # which needs another workaround. SET(KOKKOS_COMPILE_OPTIONS_TMP) FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\") + STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) + IF(OPTION_HAS_WHITESPACE EQUAL -1) + LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") + ELSE() + LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") + ENDIF() ENDFOREACH() STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 2a984eefb6..c22cc547f6 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,20 +11,21 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 3 -KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 01 +KOKKOS_VERSION_MINOR = 5 +KOKKOS_VERSION_PATCH = 00 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) -# Options: Cuda,HIP,OpenMP,Pthread,Serial +# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Pthread,Serial KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Pthread" -# Options: +# Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKX # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: Vega900,Vega906,Vega908 +# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 +# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -32,7 +33,7 @@ KOKKOS_DEBUG ?= "no" KOKKOS_USE_TPLS ?= "" # Options: c++14,c++1y,c++17,c++1z,c++2a KOKKOS_CXX_STANDARD ?= "c++14" -# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align +# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings KOKKOS_OPTIONS ?= "" KOKKOS_CMAKE ?= "no" KOKKOS_TRIBITS ?= "no" @@ -80,7 +81,7 @@ KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),exper # Check for advanced settings. 
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) -KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization) +KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization) KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning) KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align) KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check) @@ -92,6 +93,9 @@ KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) +KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) +KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code) +KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) @@ -112,6 +116,7 @@ endif # Check for other Execution Spaces. KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda) KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP) +KOKKOS_INTERNAL_USE_SYCL := $(call kokkos_has_string,$(KOKKOS_DEVICES),SYCL) KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget) KOKKOS_DEVICELIST = @@ -133,11 +138,18 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_DEVICELIST += HIP endif +KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \ + + $(KOKKOS_INTERNAL_ENABLE_CXX20) \ + + $(KOKKOS_INTERNAL_ENABLE_CXX2A)) +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + KOKKOS_DEVICELIST += SYCL + ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1) + $(error SYCL backend requires C++17 or newer) + endif + +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_DEVICELIST += OPENMPTARGET - KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \ - + $(KOKKOS_INTERNAL_ENABLE_CXX20) \ - + $(KOKKOS_INTERNAL_ENABLE_CXX2A)) ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1) $(error OpenMPTarget backend requires C++17 or newer) endif @@ -168,6 +180,8 @@ KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2 KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-")) KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) 
KOKKOS_INTERNAL_COMPILER_GCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC) @@ -247,7 +261,11 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -mp else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + else KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp + endif else ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1) KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp @@ -259,7 +277,11 @@ else # OpenMP is turned on by default in Cray compiler environment. KOKKOS_INTERNAL_OPENMP_FLAG := else - KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1) + KOKKOS_INTERNAL_OPENMP_FLAG := -fiopenmp + else + KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp + endif endif endif endif @@ -317,6 +339,13 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX) KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11) +KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP) +KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1 := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1) +KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP) + # NVIDIA based. NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30) @@ -384,20 +413,25 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3) KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2) -KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 0) + KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) + endif +endif KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900) KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906) KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908) +KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A) # Any AVX? 
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Decide what ISA level we are able to support. -KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC)) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9)) KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7)) @@ -406,7 +440,7 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW KOKKOS_INTERNAL_USE_TM := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Incompatible flags? -KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc ) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc) KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) @@ -442,6 +476,10 @@ KOKKOS_LINK_FLAGS = KOKKOS_SRC = KOKKOS_HEADERS = +#ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) + KOKKOS_LIBS += -latomic +#endif + # Generating the KokkosCore_config.h file. 
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp @@ -478,6 +516,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP') endif +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_SYCL') +endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET') ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) @@ -533,6 +575,12 @@ endif #only add the c++ standard flags if this is not CMake tmp := $(call kokkos_append_header,"/* General Settings */") +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3") +endif +ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATION_WARNINGS") +endif ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1) ifneq ($(KOKKOS_STANDALONE_CMAKE), yes) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG) @@ -635,8 +683,10 @@ endif tmp := $(call kokkos_append_header,"/* Optimization Settings */") -ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1) +ifeq ($(KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION), 1) + # deprecated tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION") endif tmp := $(call kokkos_append_header,"/* Cuda Settings */") @@ -1166,6 +1216,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx908 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HIP 90A") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --amdgpu-target=gfx90a + endif KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp) @@ -1184,6 +1239,52 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) endif endif +# Figure out the architecture flag for SYCL. 
+ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + # Let's start by adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1" + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP") + KOKKOS_INTERNAL_SYCL_ARCH_FLAG := -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp" + endif + + KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp) + KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp) + + KOKKOS_CXXFLAGS+=-fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG) + KOKKOS_LDFLAGS+=-fsycl + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_SYCL_ARCH_FLAG) +endif + +ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS") +endif KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) @@ -1196,56 +1297,62 @@ endif ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h) -# Functions for generating config header file -kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1) -kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3) -kokkos_append_config_header = $(shell echo $1 >> $2)) -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp") -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp") -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp") -tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp") -tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") -tmp := 
$(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") -tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") -tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") -ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") - ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) - else - endif -endif -ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif -ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") -endif + # Functions for generating config header file + kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1) + kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3) + kokkos_append_config_header = $(shell echo $1 >> $2)) + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp") + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp") + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp") + tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp") + tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp") + tmp := $(call 
kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") + ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) + else + endif + endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif + ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + endif endif + KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) @@ -1257,6 +1364,9 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp) ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp) + ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1) + KOKKOS_SRC += $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp + endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp) ifneq ($(CUDA_PATH),) KOKKOS_CPPLAGS += -I$(CUDA_PATH)/include diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 
cf9fc24242..93854d0cf1 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -48,6 +48,17 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp +Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/desul/src/Lock_Array_CUDA.cpp +endif + +ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) +Kokkos_SYCL.o : $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp +Kokkos_SYCL_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp +Kokkos_SYCL_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index d55ef2caac..673f462712 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -7,7 +7,7 @@ applications targeting all major HPC platforms. For that purpose it provides abstractions for both parallel execution of code and data management. Kokkos is designed to target complex node architectures with N-level memory hierarchies and multiple types of execution resources. It currently can use -CUDA, HPX, OpenMP and Pthreads as backend programming models with several other +CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other backends in development. Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem, @@ -16,29 +16,19 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools). # Learning about Kokkos -A programming guide can be found on the Wiki, the API reference is under development. +The best way to start learning about Kokkos is to go through the Kokkos Lectures. +They are available online at https://kokkos.link/the-lectures and contain a mix +of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem +capabilities. + +A programming guide and API reference can be found on the Wiki +(https://github.com/kokkos/kokkos/wiki). For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue. For non-public questions send an email to crtrott(at)sandia.gov -A separate repository with extensive tutorial material can be found under -https://github.com/kokkos/kokkos-tutorials. - -Furthermore, the 'example/tutorial' directory provides step by step tutorial -examples which explain many of the features of Kokkos. They work with -simple Makefiles. To build with g++ and OpenMP simply type 'make' -in the 'example/tutorial' directory. This will build all examples in the -subfolders. To change the build options refer to the Programming Guide -in the compilation section. 
- -To learn more about Kokkos consider watching one of our presentations: -* GTC 2015: - - http://on-demand.gputechconf.com/gtc/2015/video/S5166.html - - http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf - - # Contributing to Kokkos We are open and try to encourage contributions from external developers. @@ -53,57 +43,40 @@ For specifics see the LICENSE file contained in the repository or distribution. # Requirements -### Primary tested compilers on X86 are: -* GCC 5.3.0 -* GCC 5.4.0 -* GCC 5.5.0 -* GCC 6.1.0 -* GCC 7.2.0 -* GCC 7.3.0 -* GCC 8.1.0 -* Intel 17.0.1 -* Intel 17.4.196 -* Intel 18.2.128 -* Clang 4.0.0 -* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0) -* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1) -* Clang 8.0.0 for CUDA (CUDA Toolkit 9.2) -* PGI 18.7 -* NVCC 9.1 for CUDA (with gcc 6.1.0) -* NVCC 9.2 for CUDA (with gcc 7.2.0) -* NVCC 10.0 for CUDA (with gcc 7.4.0) -* NVCC 10.1 for CUDA (with gcc 7.4.0) -* NVCC 11.0 for CUDA (with gcc 8.4.0) +### Minimum Compiler Versions -### Primary tested compilers on Power 8 are: -* GCC 6.4.0 (OpenMP,Serial) -* GCC 7.2.0 (OpenMP,Serial) -* IBM XL 16.1.0 (OpenMP, Serial) -* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0) +Generally Kokkos should work with all compiler versions newer than the minimum. +However, as with all sufficiently complex code, we have to work around compiler +bugs in almost all compilers, so compiler versions we do not test may have issues +we are unaware of. -### Primary tested compilers on Intel KNL are: -* Intel 17.2.174 (with gcc 6.2.0 and 6.4.0) -* Intel 18.2.199 (with gcc 6.2.0 and 6.4.0) +* GCC: 5.3.0 +* Clang: 4.0.0 +* Intel: 17.0.1 +* NVCC: 9.2.88 +* NVC++: 21.5 +* ROCM: 4.3 +* MSVC: 19.29 +* IBM XL: 16.1.1 +* Fujitsu: 4.5.0 +* ARM/Clang: 20.1 -### Primary tested compilers on ARM (Cavium ThunderX2) -* GCC 7.2.0 -* ARM/Clang 18.4.0 +### Primary Tested Compilers -### Other compilers working: -* X86: - * Cygwin 2.1.0 64bit with gcc 4.9.3 - * GCC 8.1.0 (not warning free) - -### Known non-working combinations: -* Power8: - * Pthreads backend -* ARM - * Pthreads backend +* GCC: 5.3.0, 6.1.0, 7.3.0, 8.3, 9.2, 10.0 +* NVCC: 9.2.88, 10.1, 11.0 +* Clang: 8.0.0, 9.0.0, 10.0.0, 12.0.0 +* Intel: 17.4, 18.1, 19.5 +* MSVC: 19.29 +* ARM/Clang: 20.1 +* IBM XL: 16.1.1 +* ROCM: 4.3.0 ### Build system: - -* CMake >= 3.10: required -* CMake >= 3.13: recommended + +* CMake >= 3.16: required * CMake >= 3.18: Fortran linkage. This does not affect most mixed Fortran/Kokkos builds. See [build issues](BUILD.md#KnownIssues). +* CMake >= 3.21.1 for NVC++ Primary tested compilers are passing in release mode with warnings as errors. They are also tested with a comprehensive set of @@ -153,7 +126,6 @@ cmake $srcdir \ -DCMAKE_INSTALL_PREFIX=$path_to_install \ -DKokkos_ENABLE_OPENMP=On \ -DKokkos_ARCH_HSW=On \ - -DKokkos_ENABLE_HWLOC=On \ -DKokkos_HWLOC_DIR=$path_to_hwloc ```` then simply type `make install`. The Kokkos CMake package will then be installed in `$path_to_install` to be used by downstream packages. @@ -212,23 +184,8 @@ where `...` is the unique spec identifying the particular Kokkos configuration a Some more details can be found in the Kokkos Spack [documentation](Spack.md) or the Spack [website](https://spack.readthedocs.io/en/latest). ## Raw Makefile -A bash script is provided to generate raw makefiles. 
-To install Kokkos as a library create a build directory and run the following -````bash -> $KOKKOS_PATH/generate_makefile.bash --prefix=$path_to_install -```` -Once the Makefile is generated, run: -````bash -> make kokkoslib -> make install -```` -To additionally run the unit tests: -````bash -> make build-test -> make test -```` -Run `generate_makefile.bash --help` for more detailed options such as -changing the device type for which to build. + +Raw Makefiles are only supported via inline builds. See below. ## Inline Builds vs. Installed Package For individual projects, it may be preferable to build Kokkos inline rather than link to an installed package. @@ -268,6 +225,35 @@ more than a single GPU is used by a single process. If you publish work which mentions Kokkos, please cite the following paper: +````BibTeX +@ARTICLE{9485033, + author={Trott, Christian R. and Lebrun-Grandié, Damien and Arndt, Daniel and Ciesko, Jan and Dang, Vinh and Ellingwood, Nathan and Gayatri, Rahulkumar and Harvey, Evan and Hollman, Daisy S. and Ibanez, Dan and Liber, Nevin and Madsen, Jonathan and Miles, Jeff and Poliakoff, David and Powell, Amy and Rajamanickam, Sivasankaran and Simberg, Mikael and Sunderland, Dan and Turcksin, Bruno and Wilke, Jeremiah}, + journal={IEEE Transactions on Parallel and Distributed Systems}, + title={Kokkos 3: Programming Model Extensions for the Exascale Era}, + year={2022}, + volume={33}, + number={4}, + pages={805-817}, + doi={10.1109/TPDS.2021.3097283}} +```` + +If you use more than one Kokkos EcoSystem package, please also cite: + +````BibTeX +@ARTICLE{9502936, + author={Trott, Christian and Berger-Vergiat, Luc and Poliakoff, David and Rajamanickam, Sivasankaran and Lebrun-Grandie, Damien and Madsen, Jonathan and Al Awar, Nader and Gligoric, Milos and Shipman, Galen and Womeldorff, Geoff}, + journal={Computing in Science & Engineering}, + title={The Kokkos EcoSystem: Comprehensive Performance Portability for High Performance Computing}, + year={2021}, + volume={23}, + number={5}, + pages={10-18}, + doi={10.1109/MCSE.2021.3098509}} +```` + + +And if you feel generous, feel free to cite the original Kokkos paper, which describes most of the basic Kokkos concepts: + ````BibTeX @article{CarterEdwards20143202, title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ", diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index 4df76a1dbb..eb54db8a55 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -5,9 +5,7 @@ KOKKOS_SUBPACKAGE(Algorithms) IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() -IF(NOT (KOKKOS_ENABLE_OPENMPTARGET - AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR - KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))) +IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) ENDIF() diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 55ce19971f..46b8ab87fa 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -447,6 +447,25 @@ struct rand { } }; +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +template <class Generator> +struct rand<Generator, Kokkos::Experimental::half_t> { + using half = Kokkos::Experimental::half_t; + KOKKOS_INLINE_FUNCTION + static half max() { return half(1.0); } + KOKKOS_INLINE_FUNCTION + static half draw(Generator& gen) { return half(gen.frand()); } + KOKKOS_INLINE_FUNCTION + static half 
draw(Generator& gen, const half& range) { + return half(gen.frand(float(range))); + } + KOKKOS_INLINE_FUNCTION + static half draw(Generator& gen, const half& start, const half& end) { + return half(gen.frand(float(start), float(end))); + } +}; +#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + template struct rand { KOKKOS_INLINE_FUNCTION @@ -600,7 +619,7 @@ struct Random_XorShift1024_UseCArrayState template struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type) { #ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST @@ -615,7 +634,7 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_CUDA template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { #ifdef __CUDA_ARCH__ @@ -625,7 +644,7 @@ struct Random_UniqueIndex { blockDim.x * blockDim.y * blockDim.z + i_offset) % locks_.extent(0); - while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) { i += blockDim.x * blockDim.y * blockDim.z; if (i >= static_cast(locks_.extent(0))) { i = i_offset; @@ -643,7 +662,7 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_HIP template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { #ifdef __HIP_DEVICE_COMPILE__ @@ -653,7 +672,7 @@ struct Random_UniqueIndex { blockDim.x * blockDim.y * blockDim.z + i_offset) % locks_.extent(0); - while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) { i += blockDim.x * blockDim.y * blockDim.z; if (i >= static_cast(locks_.extent(0))) { i = i_offset; @@ -671,15 +690,15 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_SYCL template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { -#ifdef KOKKOS_ARCH_INTEL_GEN +#ifdef KOKKOS_ARCH_INTEL_GPU int i = Kokkos::Impl::clock_tic() % locks_.extent(0); #else int i = 0; #endif - while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) { i = (i + 1) % static_cast(locks_.extent(0)); } return i; @@ -690,14 +709,14 @@ struct Random_UniqueIndex { #ifdef KOKKOS_ENABLE_OPENMPTARGET template <> struct Random_UniqueIndex { - using locks_view_type = View; + using locks_view_type = View; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks) { const int team_size = omp_get_num_threads(); int i = omp_get_team_num() * team_size + omp_get_thread_num(); const int lock_size = locks.extent_int(0); - while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) { + while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) { i = (i + 1) % lock_size; } return i; @@ -856,18 +875,22 @@ template class Random_XorShift64_Pool { private: using execution_space = typename DeviceType::execution_space; - using locks_type = View; - using state_data_type = View; + using locks_type = View; + using state_data_type = View; locks_type locks_; state_data_type state_; int num_states_; + int padding_; public: using generator_type = Random_XorShift64; using device_type = DeviceType; KOKKOS_INLINE_FUNCTION - Random_XorShift64_Pool() { num_states_ = 0; } + Random_XorShift64_Pool() { + num_states_ = 
0; + padding_ = 0; + } Random_XorShift64_Pool(uint64_t seed) { num_states_ = 0; @@ -883,16 +906,22 @@ class Random_XorShift64_Pool { locks_ = src.locks_; state_ = src.state_; num_states_ = src.num_states_; + padding_ = src.padding_; return *this; } void init(uint64_t seed, int num_states) { if (seed == 0) seed = uint64_t(1318319); - + // Pad the lock/state arrays only on CPU-like archs (fewer than 1000 + // threads). 64 is a magic number: we just wanted a padding that is neither + // too large nor too small, and 64 sounded fine. + padding_ = num_states < 1000 ? 64 : 1; num_states_ = num_states; - locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_); - state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_); + locks_ = + locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_); + state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_, + padding_); typename state_data_type::HostMirror h_state = create_mirror_view(state_); typename locks_type::HostMirror h_lock = create_mirror_view(locks_); @@ -902,15 +931,15 @@ class Random_XorShift64_Pool { gen(seed, 0); for (int i = 0; i < 17; i++) gen.rand(); for (int i = 0; i < num_states_; i++) { - int n1 = gen.rand(); - int n2 = gen.rand(); - int n3 = gen.rand(); - int n4 = gen.rand(); - h_state(i) = (((static_cast(n1)) & 0xffff) << 00) | - (((static_cast(n2)) & 0xffff) << 16) | - (((static_cast(n3)) & 0xffff) << 32) | - (((static_cast(n4)) & 0xffff) << 48); - h_lock(i) = 0; + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i, 0) = (((static_cast(n1)) & 0xffff) << 00) | + (((static_cast(n2)) & 0xffff) << 16) | + (((static_cast(n3)) & 0xffff) << 32) | + (((static_cast(n4)) & 0xffff) << 48); + h_lock(i, 0) = 0; } deep_copy(state_, h_state); deep_copy(locks_, h_lock); @@ -920,19 +949,19 @@ Random_XorShift64 get_state() const { const int i = Impl::Random_UniqueIndex::get_state_idx(locks_); - return Random_XorShift64(state_(i), i); + return Random_XorShift64(state_(i, 0), i); } // NOTE: state_idx MUST be unique and less than num_states KOKKOS_INLINE_FUNCTION Random_XorShift64 get_state(const int state_idx) const { - return Random_XorShift64(state_(state_idx), state_idx); + return Random_XorShift64(state_(state_idx, 0), state_idx); } KOKKOS_INLINE_FUNCTION void free_state(const Random_XorShift64& state) const { - state_(state.state_idx_) = state.state_; - locks_(state.state_idx_) = 0; + state_(state.state_idx_, 0) = state.state_; + locks_(state.state_idx_, 0) = 0; } }; @@ -1092,14 +1121,15 @@ template class Random_XorShift1024_Pool { private: using execution_space = typename DeviceType::execution_space; - using locks_type = View; - using int_view_type = View; + using locks_type = View; + using int_view_type = View; using state_data_type = View; locks_type locks_; state_data_type state_; int_view_type p_; int num_states_; + int padding_; friend class Random_XorShift1024; public: @@ -1129,15 +1159,21 @@ class Random_XorShift1024_Pool { state_ = src.state_; p_ = src.p_; num_states_ = src.num_states_; + padding_ = src.padding_; return *this; } inline void init(uint64_t seed, int num_states) { if (seed == 0) seed = uint64_t(1318319); + // Pad the lock/state arrays only on CPU-like archs (fewer than 1000 + // threads). 64 is a magic number: we just wanted a padding that is neither + // too large nor too small, and 64 sounded fine. + padding_ = num_states < 1000 ? 
64 : 1; num_states_ = num_states; - locks_ = locks_type("Kokkos::Random_XorShift1024::locks", num_states_); + locks_ = + locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_); state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_); - p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_); + p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_); typename state_data_type::HostMirror h_state = create_mirror_view(state_); typename locks_type::HostMirror h_lock = create_mirror_view(locks_); @@ -1158,8 +1194,8 @@ class Random_XorShift1024_Pool { (((static_cast(n3)) & 0xffff) << 32) | (((static_cast(n4)) & 0xffff) << 48); } - h_p(i) = 0; - h_lock(i) = 0; + h_p(i, 0) = 0; + h_lock(i, 0) = 0; } deep_copy(state_, h_state); deep_copy(locks_, h_lock); @@ -1169,20 +1205,20 @@ class Random_XorShift1024_Pool { Random_XorShift1024 get_state() const { const int i = Impl::Random_UniqueIndex::get_state_idx(locks_); - return Random_XorShift1024(state_, p_(i), i); + return Random_XorShift1024(state_, p_(i, 0), i); }; // NOTE: state_idx MUST be unique and less than num_states KOKKOS_INLINE_FUNCTION Random_XorShift1024 get_state(const int state_idx) const { - return Random_XorShift1024(state_, p_(state_idx), state_idx); + return Random_XorShift1024(state_, p_(state_idx, 0), state_idx); } KOKKOS_INLINE_FUNCTION void free_state(const Random_XorShift1024& state) const { for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i]; - p_(state.state_idx_) = state.p_; - locks_(state.state_idx_) = 0; + p_(state.state_idx_, 0) = state.p_; + locks_(state.state_idx_, 0) = 0; } }; diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp index d17c02776f..9c2e8b978b 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp @@ -206,8 +206,10 @@ class BinSort { //---------------------------------------- // Constructor: takes the keys, the binning_operator and optionally whether to // sort within bins (default false) - BinSort(const_key_view_type keys_, int range_begin_, int range_end_, - BinSortOp bin_op_, bool sort_within_bins_ = false) + template + BinSort(const ExecutionSpace& exec, const_key_view_type keys_, + int range_begin_, int range_end_, BinSortOp bin_op_, + bool sort_within_bins_ = false) : keys(keys_), keys_rnd(keys_), bin_op(bin_op_), @@ -222,50 +224,63 @@ class BinSort { "Kokkos::SortImpl::BinSortFunctor::bin_count", bin_op.max_bins()); bin_count_const = bin_count_atomic; bin_offsets = - offset_type(view_alloc(WithoutInitializing, + offset_type(view_alloc(exec, WithoutInitializing, "Kokkos::SortImpl::BinSortFunctor::bin_offsets"), bin_op.max_bins()); sort_order = - offset_type(view_alloc(WithoutInitializing, + offset_type(view_alloc(exec, WithoutInitializing, "Kokkos::SortImpl::BinSortFunctor::sort_order"), range_end - range_begin); } + BinSort(const_key_view_type keys_, int range_begin_, int range_end_, + BinSortOp bin_op_, bool sort_within_bins_ = false) + : BinSort(execution_space{}, keys_, range_begin_, range_end_, bin_op_, + sort_within_bins_) {} + + template + BinSort(const ExecutionSpace& exec, const_key_view_type keys_, + BinSortOp bin_op_, bool sort_within_bins_ = false) + : BinSort(exec, keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {} + BinSort(const_key_view_type keys_, BinSortOp bin_op_, bool sort_within_bins_ = false) - : BinSort(keys_, 0, keys_.extent(0), bin_op_, sort_within_bins_) {} + : BinSort(execution_space{}, 
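
Editorial note on the padding above: making the lock and state arrays rank-2 views with a pad of 64 presumably keeps neighboring pool states off the same cache line on host-like backends. A minimal sketch of the layout effect, not part of the patch (view name and extents are illustrative):

```cpp
#include <Kokkos_Core.hpp>

// With LayoutRight, element (i, 0) of a padded rank-2 view sits
// pad * sizeof(int) bytes away from element (i + 1, 0). With pad = 64 that
// is 256 bytes, so threads spinning on neighboring locks do not share a
// cache line.
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int num_states = 128;
    const int pad        = num_states < 1000 ? 64 : 1;  // patch heuristic
    Kokkos::View<int**, Kokkos::LayoutRight, Kokkos::HostSpace> locks(
        "locks", num_states, pad);
    // Only column 0 is ever used; the remaining columns are padding.
    for (int i = 0; i < num_states; ++i) locks(i, 0) = 0;
  }
  Kokkos::finalize();
  return 0;
}
```
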
keys_, bin_op_, sort_within_bins_) {} //---------------------------------------- // Create the permutation vector, the bin_offset array and the bin_count // array. Can be called again if keys changed - void create_permute_vector() { + template + void create_permute_vector(const ExecutionSpace& exec = execution_space{}) { const size_t len = range_end - range_begin; Kokkos::parallel_for( "Kokkos::Sort::BinCount", - Kokkos::RangePolicy(0, len), *this); + Kokkos::RangePolicy(exec, 0, len), + *this); Kokkos::parallel_scan("Kokkos::Sort::BinOffset", - Kokkos::RangePolicy( - 0, bin_op.max_bins()), + Kokkos::RangePolicy( + exec, 0, bin_op.max_bins()), *this); - Kokkos::deep_copy(bin_count_atomic, 0); + Kokkos::deep_copy(exec, bin_count_atomic, 0); Kokkos::parallel_for( "Kokkos::Sort::BinBinning", - Kokkos::RangePolicy(0, len), *this); + Kokkos::RangePolicy(exec, 0, len), + *this); if (sort_within_bins) Kokkos::parallel_for( "Kokkos::Sort::BinSort", - Kokkos::RangePolicy( - 0, bin_op.max_bins()), + Kokkos::RangePolicy( + exec, 0, bin_op.max_bins()), *this); } // Sort a subset of a view with respect to the first dimension using the // permutation array - template - void sort(ValuesViewType const& values, int values_range_begin, - int values_range_end) const { + template + void sort(const ExecutionSpace& exec, ValuesViewType const& values, + int values_range_begin, int values_range_end) const { using scratch_view_type = Kokkos::View 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG, values.rank_dynamic > 1 ? values.extent(1) @@ -308,7 +323,7 @@ class BinSort { values_range_begin - range_begin); parallel_for("Kokkos::Sort::CopyPermute", - Kokkos::RangePolicy(0, len), functor); + Kokkos::RangePolicy(exec, 0, len), functor); } { @@ -316,10 +331,23 @@ class BinSort { values, range_begin, sorted_values); parallel_for("Kokkos::Sort::Copy", - Kokkos::RangePolicy(0, len), functor); + Kokkos::RangePolicy(exec, 0, len), functor); } + } - execution_space().fence(); + // Sort a subset of a view with respect to the first dimension using the + // permutation array + template + void sort(ValuesViewType const& values, int values_range_begin, + int values_range_end) const { + execution_space exec; + sort(exec, values, values_range_begin, values_range_end); + exec.fence("Kokkos::Sort: fence after sorting"); + } + + template + void sort(ExecutionSpace const& exec, ValuesViewType const& values) const { + this->sort(exec, values, 0, /*values.extent(0)*/ range_end - range_begin); } template @@ -485,17 +513,19 @@ struct BinOp3D { namespace Impl { -template -bool try_std_sort(ViewType view) { +template +bool try_std_sort(ViewType view, const ExecutionSpace& exec) { bool possible = true; size_t stride[8] = {view.stride_0(), view.stride_1(), view.stride_2(), view.stride_3(), view.stride_4(), view.stride_5(), view.stride_6(), view.stride_7()}; possible = possible && - std::is_same::value; + SpaceAccessibility::accessible; possible = possible && (ViewType::Rank == 1); possible = possible && (stride[0] == 1); if (possible) { + exec.fence("Kokkos::sort: Fence before sorting on the host"); std::sort(view.data(), view.data() + view.extent(0)); } return possible; @@ -518,10 +548,12 @@ struct min_max_functor { } // namespace Impl -template -void sort(ViewType const& view, bool const always_use_kokkos_sort = false) { +template +std::enable_if_t::value> sort( + const ExecutionSpace& exec, ViewType const& view, + bool const always_use_kokkos_sort = false) { if (!always_use_kokkos_sort) { - if (Impl::try_std_sort(view)) return; + if 
(Impl::try_std_sort(view, exec)) return;
   }
   using CompType = BinOp1D<typename ViewType::non_const_value_type>;
@@ -529,34 +561,50 @@ void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
   Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
   Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
   parallel_reduce("Kokkos::Sort::FindExtent",
                   Kokkos::RangePolicy<typename ViewType::execution_space>(
-                      0, view.extent(0)),
+                      exec, 0, view.extent(0)),
                   Impl::min_max_functor<ViewType>(view), reducer);
   if (result.min_val == result.max_val) return;
   BinSort<ViewType, CompType> bin_sort(
       view, CompType(view.extent(0) / 2, result.min_val, result.max_val),
       true);
-  bin_sort.create_permute_vector();
-  bin_sort.sort(view);
+  bin_sort.create_permute_vector(exec);
+  bin_sort.sort(exec, view);
 }

 template <class ViewType>
-void sort(ViewType view, size_t const begin, size_t const end) {
+void sort(ViewType const& view, bool const always_use_kokkos_sort = false) {
+  typename ViewType::execution_space exec;
+  sort(exec, view, always_use_kokkos_sort);
+  exec.fence("Kokkos::Sort: fence after sorting");
+}
+
+template <class ExecutionSpace, class ViewType>
+std::enable_if_t<Kokkos::is_execution_space<ExecutionSpace>::value> sort(
+    const ExecutionSpace& exec, ViewType view, size_t const begin,
+    size_t const end) {
   using range_policy = Kokkos::RangePolicy<typename ViewType::execution_space>;
   using CompType     = BinOp1D<typename ViewType::non_const_value_type>;

   Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
   Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
-  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(begin, end),
+  parallel_reduce("Kokkos::Sort::FindExtent", range_policy(exec, begin, end),
                   Impl::min_max_functor<ViewType>(view), reducer);
   if (result.min_val == result.max_val) return;
   BinSort<ViewType, CompType> bin_sort(
-      view, begin, end,
+      exec, view, begin, end,
       CompType((end - begin) / 2, result.min_val, result.max_val), true);
-  bin_sort.create_permute_vector();
-  bin_sort.sort(view, begin, end);
+  bin_sort.create_permute_vector(exec);
+  bin_sort.sort(exec, view, begin, end);
+}
+
+template <class ViewType>
+void sort(ViewType view, size_t const begin, size_t const end) {
+  typename ViewType::execution_space exec;
+  sort(exec, view, begin, end);
+  exec.fence("Kokkos::Sort: fence after sorting");
 }
 }  // namespace Kokkos
diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
index c37e779c99..3dffce7df4 100644
--- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
+++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -47,7 +47,7 @@
 #include
 #include
 #include
-#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Timer.hpp>
 #include
 #include
 #include
@@ -198,11 +198,50 @@ struct test_random_functor {
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp2 / theMax);
       const uint64_t ind3_3d =
           static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp3 / theMax);
-
+// Work around an Intel 17 compiler bug: the compiler sometimes adds random
+// instruction alignment which makes the lock instruction illegal. It seems
+// to affect mostly unsigned int atomics. Looking at the assembly, the
+// compiler appears to insert cache-line alignment for the instruction.
+// The issue is not restricted to specific architectures; it has shown up on
+// SNB and SKX, but for different code. Another occurrence was with Desul
+// atomics in a different unit test, while this one here happens without
+// Desul atomics. Inserting an assembly nop instruction changes the
+// alignment and works around it.
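
Stepping back to the Kokkos_Sort.hpp changes above: the new execution-space overloads let callers run the sort on an explicit execution space instance and take over fencing themselves, while the old signatures keep their blocking behavior. A hedged usage sketch (function and variable names are ours, not from the patch):

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

// Sketch: sorting on a specific execution space instance. The
// instance-taking overload does not fence internally, so the caller decides
// when to synchronize with the device.
void sort_async(Kokkos::View<double*> data) {
  Kokkos::DefaultExecutionSpace exec;
  Kokkos::sort(exec, data);             // enqueued on 'exec', asynchronous
  exec.fence("wait for Kokkos::sort");  // fence only when the result is needed
}
```
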
+// +// 17.0.4 for 64bit Random works with 1/1/1/2/1 +// 17.0.4 for 1024bit Random works with 1/1/1/1/1 +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_1d(ind1_1d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_1d(ind2_1d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_1d(ind3_1d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + if (std::is_same>::value) + asm volatile("nop\n"); + asm volatile("nop\n"); +#endif +#endif atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1); +#ifdef KOKKOS_COMPILER_INTEL +#if (KOKKOS_COMPILER_INTEL < 1800) + asm volatile("nop\n"); +#endif +#endif } rand_pool.free_state(rand_gen); } @@ -338,9 +377,11 @@ struct test_random_scalar { using functor_type = test_histogram1d_functor; parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result); - - double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); - double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D; + double mean_eps_expect = 0.0001; + double variance_eps_expect = 0.07; + double covariance_eps_expect = 0.06; + double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); + double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D; double variance_expect = 1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D); double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D; @@ -349,11 +390,26 @@ struct test_random_scalar { variance_expect / (result.variance / HIST_DIM1D) - 1.0; double covariance_eps = (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect; - pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0; - pass_hist1d_var = - ((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 1 : 0; - pass_hist1d_covar = - ((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 1 : 0; + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + if (std::is_same::value) { + mean_eps_expect = 0.0003; + variance_eps_expect = 1.0; + covariance_eps_expect = 5.0e4; + } +#endif + + pass_hist1d_mean = + ((-mean_eps_expect < mean_eps) && (mean_eps_expect > mean_eps)) ? 1 + : 0; + pass_hist1d_var = ((-variance_eps_expect < variance_eps) && + (variance_eps_expect > variance_eps)) + ? 1 + : 0; + pass_hist1d_covar = ((-covariance_eps_expect < covariance_eps) && + (covariance_eps_expect > covariance_eps)) + ? 
1 + : 0; cout << "Density 1D: " << mean_eps << " " << variance_eps << " " << (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || " @@ -371,8 +427,9 @@ struct test_random_scalar { test_histogram3d_functor; parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result); - double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); - double mean_expect = 1.0 * num_draws / HIST_DIM1D; + double variance_factor = 1.2; + double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D); + double mean_expect = 1.0 * num_draws / HIST_DIM1D; double variance_expect = 1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D); double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D; @@ -381,15 +438,23 @@ struct test_random_scalar { variance_expect / (result.variance / HIST_DIM1D) - 1.0; double covariance_eps = (result.covariance / HIST_DIM1D - covariance_expect) / mean_expect; + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + if (std::is_same::value) { + variance_factor = 7; + } +#endif + pass_hist3d_mean = ((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0; - pass_hist3d_var = ((-1.2 * tolerance < variance_eps) && - (1.2 * tolerance > variance_eps)) + pass_hist3d_var = ((-variance_factor * tolerance < variance_eps) && + (variance_factor * tolerance > variance_eps)) ? 1 : 0; - pass_hist3d_covar = - ((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 1 - : 0; + pass_hist3d_covar = ((-variance_factor * tolerance < covariance_eps) && + (variance_factor * tolerance > covariance_eps)) + ? 1 + : 0; cout << "Density 3D: " << mean_eps << " " << variance_eps << " " << result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance @@ -471,6 +536,21 @@ void test_random(unsigned int num_draws) { deep_copy(density_1d, 0); deep_copy(density_3d, 0); + cout << "Test Scalar=half" << endl; + test_random_scalar test_half( + density_1d, density_3d, pool, num_draws); + ASSERT_EQ(test_half.pass_mean, 1); + ASSERT_EQ(test_half.pass_var, 1); + ASSERT_EQ(test_half.pass_covar, 1); + ASSERT_EQ(test_half.pass_hist1d_mean, 1); + ASSERT_EQ(test_half.pass_hist1d_var, 1); + ASSERT_EQ(test_half.pass_hist1d_covar, 1); + ASSERT_EQ(test_half.pass_hist3d_mean, 1); + ASSERT_EQ(test_half.pass_hist3d_var, 1); + ASSERT_EQ(test_half.pass_hist3d_covar, 1); + deep_copy(density_1d, 0); + deep_copy(density_3d, 0); + cout << "Test Scalar=float" << endl; test_random_scalar test_float(density_1d, density_3d, pool, num_draws); diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index 9c6308c843..de1e6b3c31 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -135,8 +135,9 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) { KeyViewType keys("Keys", n); // Test sorting array with all numbers equal - Kokkos::deep_copy(keys, KeyType(1)); - Kokkos::sort(keys, force_kokkos); + ExecutionSpace exec; + Kokkos::deep_copy(exec, keys, KeyType(1)); + Kokkos::sort(exec, keys, force_kokkos); Kokkos::Random_XorShift64_Pool g(1931); Kokkos::fill_random(keys, g, @@ -147,13 +148,16 @@ void test_1D_sort_impl(unsigned int n, bool force_kokkos) { double sum_after = 0.0; unsigned int sort_fails = 0; - Kokkos::parallel_reduce(n, sum(keys), sum_before); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys), sum_before); - Kokkos::sort(keys, force_kokkos); + Kokkos::sort(exec, keys, force_kokkos); - Kokkos::parallel_reduce(n, sum(keys), sum_after); - Kokkos::parallel_reduce( - n - 1, 
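
For orientation alongside the relaxed half-precision tolerances above, drawing half_t randoms through the rand specialization added in this release looks roughly like the following editorial sketch (pool seed and view shape are arbitrary):

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

using half_t = Kokkos::Experimental::half_t;

// Sketch: fill a view with half-precision uniform randoms in [0, 1).
// Per the patch, the half_t draw converts through float internally.
void fill_random_half(Kokkos::View<half_t*> v) {
  Kokkos::Random_XorShift64_Pool<> pool(/*seed=*/1931);
  Kokkos::parallel_for(
      "FillHalf", v.extent(0), KOKKOS_LAMBDA(const int i) {
        auto gen = pool.get_state();
        v(i)     = Kokkos::rand<decltype(gen), half_t>::draw(gen, half_t(1.0f));
        pool.free_state(gen);
      });
}
```
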
is_sorted_struct(keys), sort_fails); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys), sum_after); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n - 1), + is_sorted_struct(keys), + sort_fails); double ratio = sum_before / sum_after; double epsilon = 1e-10; @@ -177,8 +181,10 @@ void test_3D_sort_impl(unsigned int n) { double sum_after = 0.0; unsigned int sort_fails = 0; - Kokkos::parallel_reduce(keys.extent(0), sum3D(keys), - sum_before); + ExecutionSpace exec; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, keys.extent(0)), + sum3D(keys), sum_before); int bin_1d = 1; while (bin_1d * bin_1d * bin_1d * 4 < (int)keys.extent(0)) bin_1d *= 2; @@ -189,15 +195,17 @@ void test_3D_sort_impl(unsigned int n) { using BinOp = Kokkos::BinOp3D; BinOp bin_op(bin_max, min, max); Kokkos::BinSort Sorter(keys, bin_op, false); - Sorter.create_permute_vector(); - Sorter.template sort(keys); + Sorter.create_permute_vector(exec); + Sorter.sort(exec, keys); - Kokkos::parallel_reduce(keys.extent(0), sum3D(keys), - sum_after); - Kokkos::parallel_reduce(keys.extent(0) - 1, - bin3d_is_sorted_struct( - keys, bin_1d, min[0], max[0]), - sort_fails); + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, keys.extent(0)), + sum3D(keys), sum_after); + Kokkos::parallel_reduce( + Kokkos::RangePolicy(exec, 0, keys.extent(0) - 1), + bin3d_is_sorted_struct(keys, bin_1d, min[0], + max[0]), + sort_fails); double ratio = sum_before / sum_after; double epsilon = 1e-10; @@ -229,36 +237,36 @@ void test_dynamic_view_sort_impl(unsigned int n) { KeyViewType keys_view("KeysTmp", n); // Test sorting array with all numbers equal - Kokkos::deep_copy(keys_view, KeyType(1)); + ExecutionSpace exec; + Kokkos::deep_copy(exec, keys_view, KeyType(1)); Kokkos::deep_copy(keys, keys_view); - Kokkos::sort(keys, 0 /* begin */, n /* end */); + Kokkos::sort(exec, keys, 0 /* begin */, n /* end */); Kokkos::Random_XorShift64_Pool g(1931); Kokkos::fill_random(keys_view, g, Kokkos::Random_XorShift64_Pool< ExecutionSpace>::generator_type::MAX_URAND); - ExecutionSpace().fence(); + exec.fence(); Kokkos::deep_copy(keys, keys_view); - // ExecutionSpace().fence(); double sum_before = 0.0; double sum_after = 0.0; unsigned int sort_fails = 0; - Kokkos::parallel_reduce(n, sum(keys_view), - sum_before); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys_view), sum_before); - Kokkos::sort(keys, 0 /* begin */, n /* end */); + Kokkos::sort(exec, keys, 0 /* begin */, n /* end */); - ExecutionSpace().fence(); // Need this fence to prevent BusError with Cuda + exec.fence(); // Need this fence to prevent BusError with Cuda Kokkos::deep_copy(keys_view, keys); - // ExecutionSpace().fence(); - Kokkos::parallel_reduce(n, sum(keys_view), - sum_after); - Kokkos::parallel_reduce( - n - 1, is_sorted_struct(keys_view), sort_fails); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n), + sum(keys_view), sum_after); + Kokkos::parallel_reduce(Kokkos::RangePolicy(exec, 0, n - 1), + is_sorted_struct(keys_view), + sort_fails); double ratio = sum_before / sum_after; double epsilon = 1e-10; @@ -301,9 +309,10 @@ void test_issue_1160_impl() { for (int i = 0; i < 10; ++i) { h_v.access(i, 0) = h_x.access(i, 0) = double(h_element(i)); } - Kokkos::deep_copy(element_, h_element); - Kokkos::deep_copy(x_, h_x); - Kokkos::deep_copy(v_, h_v); + ExecutionSpace exec; + Kokkos::deep_copy(exec, element_, h_element); + Kokkos::deep_copy(exec, x_, h_x); + Kokkos::deep_copy(exec, v_, h_v); using KeyViewType = decltype(element_); using BinOp = 
Kokkos::BinOp1D; @@ -316,15 +325,16 @@ void test_issue_1160_impl() { Kokkos::BinSort Sorter(element_, begin, end, binner, false); - Sorter.create_permute_vector(); - Sorter.sort(element_, begin, end); + Sorter.create_permute_vector(exec); + Sorter.sort(exec, element_, begin, end); - Sorter.sort(x_, begin, end); - Sorter.sort(v_, begin, end); + Sorter.sort(exec, x_, begin, end); + Sorter.sort(exec, v_, begin, end); - Kokkos::deep_copy(h_element, element_); - Kokkos::deep_copy(h_x, x_); - Kokkos::deep_copy(h_v, v_); + Kokkos::deep_copy(exec, h_element, element_); + Kokkos::deep_copy(exec, h_x, x_); + Kokkos::deep_copy(exec, h_v, v_); + exec.fence(); ASSERT_EQ(h_element(0), 9); ASSERT_EQ(h_element(1), 8); diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml index e8763c0b66..73a0d31875 100644 --- a/lib/kokkos/appveyor.yml +++ b/lib/kokkos/appveyor.yml @@ -3,4 +3,8 @@ image: clone_folder: c:\projects\source build_script: - cmd: >- - cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake + mkdir build && + cd build && + cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_3=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && + cmake --build . --target install && + ctest -C Debug --output-on-failure diff --git a/lib/kokkos/benchmarks/atomic/main.cpp b/lib/kokkos/benchmarks/atomic/main.cpp index 7b5caa1aee..cc0d3e41e8 100644 --- a/lib/kokkos/benchmarks/atomic/main.cpp +++ b/lib/kokkos/benchmarks/atomic/main.cpp @@ -1,12 +1,12 @@ #include -#include +#include #include template double test_atomic(int L, int N, int M, int K, int R, Kokkos::View offsets) { Kokkos::View output("Output", N); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; for (int r = 0; r < R; r++) Kokkos::parallel_for( @@ -28,7 +28,7 @@ template double test_no_atomic(int L, int N, int M, int K, int R, Kokkos::View offsets) { Kokkos::View output("Output", N); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; for (int r = 0; r < R; r++) Kokkos::parallel_for( L, KOKKOS_LAMBDA(const int& i) { diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp index 62d7ef4a4c..4fc6ca2c68 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench.hpp @@ -43,7 +43,7 @@ */ #include -#include +#include template struct Run { diff --git a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp index 6da2407a08..75f30a3409 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/main.cpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/main.cpp @@ -43,7 +43,7 @@ */ #include -#include +#include #include #include diff --git a/lib/kokkos/benchmarks/gather/main.cpp b/lib/kokkos/benchmarks/gather/main.cpp index 5f10e4dcc1..dd502faaa4 100644 --- a/lib/kokkos/benchmarks/gather/main.cpp +++ b/lib/kokkos/benchmarks/gather/main.cpp @@ -43,7 +43,7 @@ */ #include -#include +#include #include #include diff --git a/lib/kokkos/benchmarks/stream/stream-kokkos.cpp b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp index e7ef67e080..311947c197 100644 --- a/lib/kokkos/benchmarks/stream/stream-kokkos.cpp +++ b/lib/kokkos/benchmarks/stream/stream-kokkos.cpp @@ -52,35 +52,33 @@ #define HLINE "-------------------------------------------------------------\n" -#if 
defined(KOKKOS_ENABLE_CUDA) -using StreamHostArray = Kokkos::View::HostMirror; -using StreamDeviceArray = Kokkos::View; -#else -using StreamHostArray = Kokkos::View::HostMirror; -using StreamDeviceArray = Kokkos::View; -#endif +using StreamDeviceArray = + Kokkos::View>; +using StreamHostArray = typename StreamDeviceArray::HostMirror; using StreamIndex = int; +using Policy = Kokkos::RangePolicy>; -double now() { - struct timeval now; - gettimeofday(&now, nullptr); - - return (double)now.tv_sec + ((double)now.tv_usec * 1.0e-6); -} - -void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, - StreamDeviceArray& c) { +void perform_set(StreamDeviceArray& a, const double scalar) { Kokkos::parallel_for( - "copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i]; }); + "set", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { a[i] = scalar; }); Kokkos::fence(); } -void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, - StreamDeviceArray& c, const double scalar) { +void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b) { Kokkos::parallel_for( - "copy", a.extent(0), + "copy", Policy(0, a.extent(0)), + KOKKOS_LAMBDA(const StreamIndex i) { b[i] = a[i]; }); + + Kokkos::fence(); +} + +void perform_scale(StreamDeviceArray& b, StreamDeviceArray& c, + const double scalar) { + Kokkos::parallel_for( + "scale", Policy(0, b.extent(0)), KOKKOS_LAMBDA(const StreamIndex i) { b[i] = scalar * c[i]; }); Kokkos::fence(); @@ -89,7 +87,7 @@ void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) { Kokkos::parallel_for( - "add", a.extent(0), + "add", Policy(0, a.extent(0)), KOKKOS_LAMBDA(const StreamIndex i) { c[i] = a[i] + b[i]; }); Kokkos::fence(); @@ -98,7 +96,7 @@ void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c, const double scalar) { Kokkos::parallel_for( - "triad", a.extent(0), + "triad", Policy(0, a.extent(0)), KOKKOS_LAMBDA(const StreamIndex i) { a[i] = b[i] + scalar * c[i]; }); Kokkos::fence(); @@ -184,6 +182,7 @@ int run_benchmark() { const double scalar = 3.0; + double setTime = std::numeric_limits::max(); double copyTime = std::numeric_limits::max(); double scaleTime = std::numeric_limits::max(); double addTime = std::numeric_limits::max(); @@ -191,13 +190,10 @@ int run_benchmark() { printf("Initializing Views...\n"); -#if defined(KOKKOS_HAVE_OPENMP) Kokkos::parallel_for( - "init", Kokkos::RangePolicy(0, STREAM_ARRAY_SIZE), -#else - Kokkos::parallel_for( - "init", Kokkos::RangePolicy(0, STREAM_ARRAY_SIZE), -#endif + "init", + Kokkos::RangePolicy(0, + STREAM_ARRAY_SIZE), KOKKOS_LAMBDA(const int i) { a[i] = 1.0; b[i] = 2.0; @@ -209,26 +205,30 @@ int run_benchmark() { Kokkos::deep_copy(dev_b, b); Kokkos::deep_copy(dev_c, c); - double start; - printf("Starting benchmarking...\n"); + Kokkos::Timer timer; + for (StreamIndex k = 0; k < STREAM_NTIMES; ++k) { - start = now(); - perform_copy(dev_a, dev_b, dev_c); - copyTime = std::min(copyTime, (now() - start)); + timer.reset(); + perform_set(dev_c, 1.5); + setTime = std::min(setTime, timer.seconds()); - start = now(); - perform_scale(dev_a, dev_b, dev_c, scalar); - scaleTime = std::min(scaleTime, (now() - start)); + timer.reset(); + perform_copy(dev_a, dev_c); + copyTime = std::min(copyTime, timer.seconds()); - start = now(); + timer.reset(); + perform_scale(dev_b, dev_c, scalar); + scaleTime = std::min(scaleTime, timer.seconds()); + + 
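
The stream benchmark now times with Kokkos::Timer instead of the removed gettimeofday() helper. The basic pattern, as an editorial sketch (the kernel is illustrative):

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Timer.hpp>

// Sketch: Kokkos::Timer starts at construction; reset() restarts it and
// seconds() reads elapsed wall-clock time. Fence before reading so the
// asynchronous kernel is included in the measurement.
double time_fill(Kokkos::View<double*> a) {
  Kokkos::Timer timer;
  Kokkos::parallel_for(
      "fill", a.extent(0), KOKKOS_LAMBDA(const int i) { a(i) = 2.0 * i; });
  Kokkos::fence();
  return timer.seconds();
}
```
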
timer.reset(); perform_add(dev_a, dev_b, dev_c); - addTime = std::min(addTime, (now() - start)); + addTime = std::min(addTime, timer.seconds()); - start = now(); + timer.reset(); perform_triad(dev_a, dev_b, dev_c, scalar); - triadTime = std::min(triadTime, (now() - start)); + triadTime = std::min(triadTime, timer.seconds()); } Kokkos::deep_copy(a, dev_a); @@ -240,6 +240,9 @@ int run_benchmark() { printf(HLINE); + printf("Set %11.2f MB/s\n", + (1.0e-06 * 1.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / + setTime); printf("Copy %11.2f MB/s\n", (1.0e-06 * 2.0 * (double)sizeof(double) * (double)STREAM_ARRAY_SIZE) / copyTime); diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index 6af091a7d8..43f8a745da 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -634,15 +634,15 @@ elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then > ${HPCBIND_OUT} if [[ ${HPCBIND_TEE} -eq 0 ]]; then if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} else - eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} + eval "$@" > ${HPCBIND_OUT} 2> ${HPCBIND_ERR} fi else if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then - hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) else - eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) + eval "$@" > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2) fi fi fi diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 4e52e4d09f..27e7d15b9d 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -96,10 +96,10 @@ replace_pragma_ident=0 first_xcompiler_arg=1 # Allow for setting temp dir without setting TMPDIR in parent (see https://docs.olcf.ornl.gov/systems/summit_user_guide.html#setting-tmpdir-causes-jsm-jsrun-errors-job-state-flip-flop) -if [[ ! 
-z ${NVCC_WRAPPER_TMPDIR+x} ]]; then +if [[ -z ${NVCC_WRAPPER_TMPDIR+x} ]]; then temp_dir=${TMPDIR:-/tmp} else - temp_dir=${NVCC_WRAPPER_TMPDIR+x} + temp_dir=${NVCC_WRAPPER_TMPDIR} fi # optimization flag added as a command-line argument @@ -226,14 +226,14 @@ do cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets) + --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets|-allow-unsupported-compiler|--allow-unsupported-compiler) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument - -maxrregcount=*|--maxrregcount=*) + -maxrregcount=*|--maxrregcount=*|-time=*) cuda_args="$cuda_args $1" ;; - -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) + -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include|-time) cuda_args="$cuda_args $1 $2" shift ;; @@ -552,14 +552,14 @@ if [ $host_only -eq 1 ]; then $host_command elif [ -n "$nvcc_depfile_command" ]; then if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then - echo "$nvcc_command && $nvcc_depfile_command" + echo "TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command" fi - $nvcc_command && $nvcc_depfile_command + TMPDIR=${temp_dir} $nvcc_command && TMPDIR=${temp_dir} $nvcc_depfile_command else if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then - echo "$nvcc_command" + echo "TMPDIR=${temp_dir} $nvcc_command" fi - $nvcc_command + TMPDIR=${temp_dir} $nvcc_command fi error_code=$? diff --git a/lib/kokkos/cmake/CTestConfig.cmake.in b/lib/kokkos/cmake/CTestConfig.cmake.in deleted file mode 100644 index 1f82c0d64d..0000000000 --- a/lib/kokkos/cmake/CTestConfig.cmake.in +++ /dev/null @@ -1,91 +0,0 @@ -#----------------------------------------------------------------------------------------# -# -# CTestConfig.cmake template for Kokkos -# -#----------------------------------------------------------------------------------------# - -# -# dash-board related -# -set(CTEST_PROJECT_NAME "Kokkos") -set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC") -set(CTEST_DROP_METHOD "https") -set(CTEST_DROP_SITE "cdash.nersc.gov") -set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}") -set(CTEST_CDASH_VERSION "1.6") -set(CTEST_CDASH_QUERY_VERSION TRUE) -set(CTEST_SUBMIT_RETRY_COUNT "1") -set(CTEST_SUBMIT_RETRY_DELAY "30") - -# -# configure/build related -# -set(CTEST_BUILD_NAME "@BUILD_NAME@") -set(CTEST_MODEL "@MODEL@") -set(CTEST_SITE "@SITE@") -set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@") -set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@") -set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@") - -# -# configure/build related -# -set(CTEST_UPDATE_TYPE "git") -set(CTEST_UPDATE_VERSION_ONLY ON) -# set(CTEST_GENERATOR "") -# set(CTEST_GENERATOR_PLATFORM "") - -# -# testing related -# -set(CTEST_TIMEOUT "7200") -set(CTEST_TEST_TIMEOUT "7200") -set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100") -set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100") -set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576") - -# -# coverage related -# -set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*") - -# -# commands -# -if(NOT "@CHECKOUT_COMMAND@" STREQUAL "") - set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@") -endif() -set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@") -set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ 
@SOURCE_REALDIR@") -set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@") -if(NOT WIN32) - set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@") -endif() -set(CTEST_COVERAGE_COMMAND "gcov") -set(CTEST_MEMORYCHECK_COMMAND "valgrind") -set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@") - -# -# various configs -# -set(APPEND_VALUE @APPEND@) -if(APPEND_VALUE) - set(APPEND_CTEST APPEND) -endif() - -macro(SET_TEST_PROP VAR) - if(NOT "${ARGS}" STREQUAL "") - set(${VAR}_CTEST ${VAR} ${ARGN}) - endif() -endmacro() - -set_test_prop(START @START@) -set_test_prop(END @END@) -set_test_prop(STRIDE @STRIDE@) -set_test_prop(INCLUDE @INCLUDE@) -set_test_prop(EXCLUDE @EXCLUDE@) -set_test_prop(INCLUDE_LABEL @INCLUDE_LABEL@) -set_test_prop(EXCLUDE_LABEL @EXCLUDE_LABEL@) -set_test_prop(PARALLEL_LEVEL @PARALLEL_LEVEL@) -set_test_prop(STOP_TIME @STOP_TIME@) -set_test_prop(COVERAGE_LABELS @LABELS@) diff --git a/lib/kokkos/cmake/KokkosCI.cmake b/lib/kokkos/cmake/KokkosCI.cmake deleted file mode 100644 index e8c9af37ad..0000000000 --- a/lib/kokkos/cmake/KokkosCI.cmake +++ /dev/null @@ -1,350 +0,0 @@ -cmake_minimum_required(VERSION 3.16 FATAL_ERROR) - -message(STATUS "") - -get_cmake_property(_cached_vars CACHE_VARIABLES) -set(KOKKOS_CMAKE_ARGS) -set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT" - "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE") -list(SORT _cached_vars) -foreach(_var ${_cached_vars}) - if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES) - list(APPEND KOKKOS_CMAKE_ARGS ${_var}) - if("${_var}" STREQUAL "CMAKE_BUILD_TYPE") - set(BUILD_TYPE "${CMAKE_BUILD_TYPE}") - endif() - endif() -endforeach() - - -#----------------------------------------------------------------------------------------# -# -# Macros and variables -# -#----------------------------------------------------------------------------------------# - -macro(CHECK_REQUIRED VAR) - if(NOT DEFINED ${VAR}) - message(FATAL_ERROR "Error! 
Variable '${VAR}' must be defined") - endif() -endmacro() - -# require the build name variable -CHECK_REQUIRED(BUILD_NAME) - -# uses all args -macro(SET_DEFAULT VAR) - if(NOT DEFINED ${VAR}) - set(${VAR} ${ARGN}) - endif() - # remove these ctest configuration variables from the defines - # passed to the Kokkos configuration - if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) - list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") - endif() -endmacro() - -# uses first arg -- useful for selecting via priority from multiple -# potentially defined variables, e.g.: -# -# set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME}) -# -macro(SET_DEFAULT_ARG1 VAR) - if(NOT DEFINED ${VAR}) - foreach(_ARG ${ARGN}) - if(NOT "${_ARG}" STREQUAL "") - set(${VAR} ${_ARG}) - break() - endif() - endforeach() - endif() - # remove these ctest configuration variables from the defines - # passed to the Kokkos configuration - if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) - list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") - endif() -endmacro() - -# determine the default working directory -if(NOT "$ENV{WORKSPACE}" STREQUAL "") - set(WORKING_DIR "$ENV{WORKSPACE}") -else() - get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) -endif() - -# determine the hostname -execute_process(COMMAND hostname - OUTPUT_VARIABLE HOSTNAME - OUTPUT_STRIP_TRAILING_WHITESPACE) - -SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}") - -# get the number of processors -include(ProcessorCount) -ProcessorCount(NUM_PROCESSORS) - -# find git -find_package(Git QUIET) -if(NOT GIT_EXECUTABLE) - unset(GIT_EXECUTABLE CACHE) - unset(GIT_EXECUTABLE) -endif() - -function(EXECUTE_GIT_COMMAND VAR) - set(${VAR} "" PARENT_SCOPE) - execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN} - OUTPUT_VARIABLE VAL - RESULT_VARIABLE RET - OUTPUT_STRIP_TRAILING_WHITESPACE - WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} - ERROR_QUIET) - string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}") - set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE) - if(RET EQUAL 0) - set(${VAR} "${VAL}" PARENT_SCOPE) - endif() -endfunction() - -# just gets the git branch name if available -function(GET_GIT_BRANCH_NAME VAR) - execute_git_command(GIT_BRANCH branch --show-current) - set(_INVALID "%D" "HEAD") - if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) - execute_git_command(GIT_BRANCH show -s --format=%D) - if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) - execute_git_command(GIT_BRANCH --describe all) - endif() - endif() - # - if(GIT_BRANCH) - string(REPLACE " " ";" _DESC "${GIT_BRANCH}") - # just set it to last one via loop instead of wonky cmake index manip - foreach(_ITR ${_DESC}) - set(GIT_BRANCH "${_ITR}") - endforeach() - set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE) - message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}") - endif() -endfunction() - -# just gets the git branch name if available -function(GET_GIT_AUTHOR_NAME VAR) - execute_git_command(GIT_AUTHOR show -s --format=%an) - if(GIT_AUTHOR) - string(LENGTH "${GIT_AUTHOR}" STRLEN) - # if the build name gets too long, this can cause submission errors - if(STRLEN GREATER 24) - # remove middle initial - string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}") - # get first and sur name - string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}") - string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}") - if(S_NAME) - set(GIT_AUTHOR "${S_NAME}") - elseif(F_NAME) - set(GIT_AUTHOR "${F_NAME}") - endif() - endif() - # remove any spaces, quotes, periods, etc. 
- string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}") - set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE) - message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}") - endif() -endfunction() - -# get the name of the branch -GET_GIT_BRANCH_NAME(GIT_BRANCH) -# get the name of the author -GET_GIT_AUTHOR_NAME(GIT_AUTHOR) -# author, prefer git method for consistency -SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR}) -# SLUG == owner_name/repo_name -SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG}) -# branch name -SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH}) -# pull request number -SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM}) -# get the event type, e.g. push, pull_request, api, cron, etc. -SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE}) - -if("${BRANCH}" STREQUAL "") - message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'") - message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=") -endif() - -#----------------------------------------------------------------------------------------# -# -# Set default values if not provided on command-line -# -#----------------------------------------------------------------------------------------# - -SET_DEFAULT(SOURCE_DIR "${WORKING_DIR}") # source directory -SET_DEFAULT(BINARY_DIR "${WORKING_DIR}/build") # build directory -SET_DEFAULT(BUILD_TYPE "${CMAKE_BUILD_TYPE}") # Release, Debug, etc. -SET_DEFAULT(MODEL "Continuous") # Continuous, Nightly, or Experimental -SET_DEFAULT(JOBS 1) # number of parallel ctests -SET_DEFAULT(CTEST_COMMAND "${CMAKE_CTEST_COMMAND}") # just in case -SET_DEFAULT(CTEST_ARGS "-V --output-on-failure") # extra arguments when ctest is called -SET_DEFAULT(GIT_EXECUTABLE "git") # ctest_update -SET_DEFAULT(TARGET "all") # build target -SET_DEFAULT_ARG1(SITE "$ENV{SITE}" - "${HOSTNAME}") # update site -SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}" - "${NUM_PROCESSORS}") # number of parallel compile jobs -# -# The variable below correspond to ctest arguments, i.e. START,END,STRIDE are -# '-I START,END,STRIDE' -# -SET_DEFAULT(START "") -SET_DEFAULT(END "") -SET_DEFAULT(STRIDE "") -SET_DEFAULT(INCLUDE "") -SET_DEFAULT(EXCLUDE "") -SET_DEFAULT(INCLUDE_LABEL "") -SET_DEFAULT(EXCLUDE_LABEL "") -SET_DEFAULT(PARALLEL_LEVEL "") -SET_DEFAULT(STOP_TIME "") -SET_DEFAULT(LABELS "") -SET_DEFAULT(NOTES "") - -# default static build tag for Nightly -set(BUILD_TAG "${BRANCH}") - -if(NOT BUILD_TYPE) - # default for kokkos if not specified - set(BUILD_TYPE "RelWithDebInfo") -endif() - -# generate dynamic name if continuous or experimental model -if(NOT "${MODEL}" STREQUAL "Nightly") - if(EVENT_TYPE AND PULL_REQUEST_NUM) - # e.g. pull_request/123 - if(AUTHOR) - set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}") - else() - set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}") - endif() - elseif(SLUG) - # e.g. 
owner_name/repo_name - set(BUILD_TAG "${SLUG}") - elseif(AUTHOR) - set(BUILD_TAG "${AUTHOR}/${BRANCH}") - endif() - if(EVENT_TYPE AND NOT PULL_REQUEST_NUM) - set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}") - endif() -endif() - -# unnecessary -string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}") -string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}") - -message(STATUS "BUILD_TAG: ${BUILD_TAG}") - -set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]") - -# colons in build name create extra (empty) entries in CDash -string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}") -# unnecessary info -string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}") -# consistency -string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}") -string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}") -# miscellaneous from missing fields -string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}") -string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}") - -# check binary directory -if(EXISTS ${BINARY_DIR}) - if(NOT IS_DIRECTORY "${BINARY_DIR}") - message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!") - endif() - file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*") - if(NOT "${BINARY_DIR_FILES}" STREQUAL "") - message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not empty!") - endif() -endif() - -get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH) -get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH) - -#----------------------------------------------------------------------------------------# -# -# Generate the CTestConfig.cmake -# -#----------------------------------------------------------------------------------------# - -set(CONFIG_ARGS) -foreach(_ARG ${KOKKOS_CMAKE_ARGS}) - if(NOT "${${_ARG}}" STREQUAL "") - get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE) - if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED") - if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF") - set(_ARG_TYPE "BOOL") - elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}") - set(_ARG_TYPE "FILEPATH") - elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}") - set(_ARG_TYPE "PATH") - elseif(NOT "${${_ARG}}" STREQUAL "") - set(_ARG_TYPE "STRING") - endif() - endif() - set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n") - endif() -endforeach() - -file(WRITE ${BINARY_REALDIR}/initial-cache.cmake -" -set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\") -${CONFIG_ARGS} -") - -file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO) -message(STATUS "Initial cache:\n${_CACHE_INFO}") - -# initialize the cache -set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake") - - -# generate the CTestConfig.cmake -configure_file( - ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in - ${BINARY_REALDIR}/CTestConfig.cmake - @ONLY) - -# copy/generate the dashboard script -configure_file( - ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in - ${BINARY_REALDIR}/KokkosCTest.cmake - @ONLY) - -# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake -execute_process( - COMMAND ${CMAKE_COMMAND} -E touch CTestCustom.cmake - WORKING_DIRECTORY ${BINARY_REALDIR} - ) - -#----------------------------------------------------------------------------------------# -# -# Execute CTest -# -#----------------------------------------------------------------------------------------# - -message(STATUS "") -message(STATUS "BUILD_NAME: ${BUILD_NAME}") -message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...") -message(STATUS "") - -# e.g. 
-DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV" -string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}") - -execute_process( - COMMAND ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS} - RESULT_VARIABLE RET - WORKING_DIRECTORY ${BINARY_REALDIR} - ) - -# ensure that any non-zero result variable gets propagated -if(NOT RET EQUAL 0) - message(FATAL_ERROR "CTest return non-zero exit code: ${RET}") -endif() diff --git a/lib/kokkos/cmake/KokkosCTest.cmake.in b/lib/kokkos/cmake/KokkosCTest.cmake.in deleted file mode 100644 index b6917f3cc1..0000000000 --- a/lib/kokkos/cmake/KokkosCTest.cmake.in +++ /dev/null @@ -1,261 +0,0 @@ -cmake_minimum_required(VERSION 3.16 FATAL_ERROR) - -if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") - include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") -endif() - -include(ProcessorCount) -ProcessorCount(CTEST_PROCESSOR_COUNT) - -cmake_policy(SET CMP0009 NEW) -cmake_policy(SET CMP0011 NEW) - -# ---------------------------------------------------------------------------- # -# -- Commands -# ---------------------------------------------------------------------------- # -find_program(CTEST_CMAKE_COMMAND NAMES cmake) -find_program(CTEST_UNAME_COMMAND NAMES uname) - -find_program(CTEST_BZR_COMMAND NAMES bzr) -find_program(CTEST_CVS_COMMAND NAMES cvs) -find_program(CTEST_GIT_COMMAND NAMES git) -find_program(CTEST_HG_COMMAND NAMES hg) -find_program(CTEST_P4_COMMAND NAMES p4) -find_program(CTEST_SVN_COMMAND NAMES svn) - -find_program(VALGRIND_COMMAND NAMES valgrind) -find_program(GCOV_COMMAND NAMES gcov) -find_program(LCOV_COMMAND NAMES llvm-cov) -find_program(MEMORYCHECK_COMMAND NAMES valgrind ) - -set(MEMORYCHECK_TYPE Valgrind) -# set(MEMORYCHECK_TYPE Purify) -# set(MEMORYCHECK_TYPE BoundsChecker) -# set(MEMORYCHECK_TYPE ThreadSanitizer) -# set(MEMORYCHECK_TYPE AddressSanitizer) -# set(MEMORYCHECK_TYPE LeakSanitizer) -# set(MEMORYCHECK_TYPE MemorySanitizer) -# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer) -set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full") - -# ---------------------------------------------------------------------------- # -# -- Settings -# ---------------------------------------------------------------------------- # -## -- Process timeout in seconds -set(CTEST_TIMEOUT "7200") -## -- Set output to English -set(ENV{LC_MESSAGES} "en_EN" ) - - -# ---------------------------------------------------------------------------- # -# -- Copy ctest configuration file -# ---------------------------------------------------------------------------- # -macro(COPY_CTEST_CONFIG_FILES) - - foreach(_FILE CTestConfig.cmake CTestCustom.cmake) - - # if current directory is not binary or source directory - if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND - NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") - - # if file exists in current directory - if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE}) - configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} - ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) - endif() - - # if source and binary differ - elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") - - # if file exists in source directory but not in binary directory - if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND - NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE}) - configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE} - ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) - endif() - - endif() - endforeach() - -endmacro() - 
-ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}") - -message(STATUS "CTEST_MODEL: ${CTEST_MODEL}") - -#-------------------------------------------------------------------------# -# Start -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...") -message(STATUS "") - -ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST} - ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY}) - - -#-------------------------------------------------------------------------# -# Config -# -copy_ctest_config_files() -ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}") - - -#-------------------------------------------------------------------------# -# Update -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...") -message(STATUS "") - -ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}" - RETURN_VALUE up_ret) - - -#-------------------------------------------------------------------------# -# Configure -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...") -message(STATUS "") - -ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" - SOURCE ${CTEST_SOURCE_DIRECTORY} - ${APPEND_CTEST} - OPTIONS "${CTEST_CONFIGURE_OPTIONS}" - RETURN_VALUE config_ret) - - -#-------------------------------------------------------------------------# -# Echo configure log bc Damien wants to delay merging this PR for eternity -# -file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log") -# should only have one but loop just for safety -foreach(_LOG ${_configure_log}) - file(READ ${_LOG} _LOG_MESSAGE) - message(STATUS "Configure Log: ${_LOG}") - message(STATUS "\n${_LOG_MESSAGE}\n") -endforeach() - - -#-------------------------------------------------------------------------# -# Build -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...") -message(STATUS "") - -ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" - ${APPEND_CTEST} - RETURN_VALUE build_ret) - - -#-------------------------------------------------------------------------# -# Echo build log bc Damien wants to delay merging this PR for eternity -# -file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log") -# should only have one but loop just for safety -foreach(_LOG ${_build_log}) - file(READ ${_LOG} _LOG_MESSAGE) - message(STATUS "Build Log: ${_LOG}") - message(STATUS "\n${_LOG_MESSAGE}\n") -endforeach() - - -#-------------------------------------------------------------------------# -# Test -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...") -message(STATUS "") - -ctest_test(RETURN_VALUE test_ret - ${APPEND_CTEST} - ${START_CTEST} - ${END_CTEST} - ${STRIDE_CTEST} - ${INCLUDE_CTEST} - ${EXCLUDE_CTEST} - ${INCLUDE_LABEL_CTEST} - ${EXCLUDE_LABEL_CTEST} - ${PARALLEL_LEVEL_CTEST} - ${STOP_TIME_CTEST} - SCHEDULE_RANDOM OFF) - - -#-------------------------------------------------------------------------# -# Coverage -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...") -message(STATUS "") - -execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS} - WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY} - ERROR_QUIET) - -ctest_coverage(${APPEND_CTEST} - ${CTEST_COVERAGE_LABELS} - RETURN_VALUE cov_ret) - - -#-------------------------------------------------------------------------# -# MemCheck -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK 
stage...") -message(STATUS "") - -ctest_memcheck(RETURN_VALUE mem_ret - ${APPEND_CTEST} - ${START_CTEST} - ${END_CTEST} - ${STRIDE_CTEST} - ${INCLUDE_CTEST} - ${EXCLUDE_CTEST} - ${INCLUDE_LABEL_CTEST} - ${EXCLUDE_LABEL_CTEST} - ${PARALLEL_LEVEL_CTEST}) - - -#-------------------------------------------------------------------------# -# Submit -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...") -message(STATUS "") - -file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake") -foreach(_FILE ${NOTE_FILES}) - message(STATUS "Including CTest notes files: \"${_FILE}\"...") - include("${_FILE}") -endforeach() - -# capture submit error so it doesn't fail because of a submission error -ctest_submit(RETURN_VALUE submit_ret - RETRY_COUNT 2 - RETRY_DELAY 10 - CAPTURE_CMAKE_ERROR submit_err) - -#-------------------------------------------------------------------------# -# Submit -# -message(STATUS "") -message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})") -message(STATUS "") - - -#-------------------------------------------------------------------------# -# Non-zero exit codes for important errors -# -if(NOT config_ret EQUAL 0) - message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}") -endif() - -if(NOT build_ret EQUAL 0) - message(FATAL_ERROR "Error during build! Exit code: ${build_ret}") -endif() - -if(NOT test_ret EQUAL 0) - message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}") -endif() diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 3455b0cb42..07baa0a5f0 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -41,6 +41,7 @@ #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC +#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG @@ -49,17 +50,21 @@ #cmakedefine KOKKOS_ENABLE_COMPILER_WARNINGS #cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT #cmakedefine KOKKOS_ENABLE_TUNING -#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE +#cmakedefine KOKKOS_ENABLE_DEPRECATED_CODE_3 +#cmakedefine KOKKOS_ENABLE_DEPRECATION_WARNINGS #cmakedefine KOKKOS_ENABLE_LARGE_MEM_TESTS #cmakedefine KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK #cmakedefine KOKKOS_ENABLE_COMPLEX_ALIGN -#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION +#cmakedefine KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +#cmakedefine KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION // deprecated +#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC #cmakedefine KOKKOS_USE_LIBRT #cmakedefine KOKKOS_ENABLE_HBWSPACE #cmakedefine KOKKOS_ENABLE_LIBDL +#cmakedefine KOKKOS_ENABLE_LIBQUADMATH #cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND #cmakedefine KOKKOS_COMPILER_CUDA_VERSION @KOKKOS_COMPILER_CUDA_VERSION@ @@ -79,6 +84,12 @@ #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 #cmakedefine KOKKOS_ARCH_INTEL_GEN +#cmakedefine KOKKOS_ARCH_INTEL_DG1 +#cmakedefine KOKKOS_ARCH_INTEL_GEN9 +#cmakedefine KOKKOS_ARCH_INTEL_GEN11 +#cmakedefine KOKKOS_ARCH_INTEL_GEN12LP +#cmakedefine KOKKOS_ARCH_INTEL_XEHP +#cmakedefine KOKKOS_ARCH_INTEL_GPU #cmakedefine KOKKOS_ARCH_KEPLER #cmakedefine KOKKOS_ARCH_KEPLER30 #cmakedefine KOKKOS_ARCH_KEPLER32 @@ -95,6 +106,7 @@ #cmakedefine KOKKOS_ARCH_VOLTA70 #cmakedefine 
KOKKOS_ARCH_VOLTA72 #cmakedefine KOKKOS_ARCH_TURING75 +#cmakedefine KOKKOS_ARCH_AMPERE #cmakedefine KOKKOS_ARCH_AMPERE80 #cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMD_ZEN diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index 8d58d96415..0c825c59e0 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -29,7 +29,12 @@ ELSE() ENDIF() include(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA DEFAULT_MSG FOUND_CUDART FOUND_CUDA_DRIVER) +IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + SET(KOKKOS_CUDA_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +ELSE() + SET(KOKKOS_CUDA_ERROR DEFAULT_MSG) +ENDIF() +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${KOKKOS_CUDA_ERROR} FOUND_CUDART FOUND_CUDA_DRIVER) IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake new file mode 100644 index 0000000000..be70b711e0 --- /dev/null +++ b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -0,0 +1 @@ +KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) diff --git a/lib/kokkos/cmake/deps/quadmath.cmake b/lib/kokkos/cmake/deps/quadmath.cmake new file mode 100644 index 0000000000..826f5021d3 --- /dev/null +++ b/lib/kokkos/cmake/deps/quadmath.cmake @@ -0,0 +1,46 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + +KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath + REQUIRED_HEADERS quadmath.h + REQUIRED_LIBS_NAMES quadmath +) diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index e8b85542c6..c4637339f3 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -67,8 +67,13 @@ KOKKOS_ARCH_OPTION(ZEN3 HOST "AMD Zen3 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") +KOKKOS_ARCH_OPTION(VEGA90A GPU "" ) KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") - +KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU") +KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9") +KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11") +KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP") +KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP") IF(KOKKOS_ENABLE_COMPILER_WARNINGS) @@ -76,6 +81,12 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS) "-Wall" "-Wunused-parameter" "-Wshadow" "-pedantic" "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") + # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH + IF(Kokkos_ENABLE_LIBQUADMATH) + # warning: non-standard suffix on floating constant [-Wpedantic] + LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + ENDIF() + # OpenMPTarget compilers give erroneous warnings about sign comparison in loops IF(KOKKOS_ENABLE_OPENMPTARGET) LIST(REMOVE_ITEM COMMON_WARNINGS "-Wsign-compare") @@ -86,7 +97,7 @@ IF(KOKKOS_ENABLE_COMPILER_WARNINGS) COMPILER_SPECIFIC_FLAGS( COMPILER_ID CMAKE_CXX_COMPILER_ID - PGI NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED GNU ${GNU_WARNINGS} DEFAULT ${COMMON_WARNINGS} ) @@ -158,16 +169,18 @@ ENDIF() IF (KOKKOS_ARCH_ARMV80) COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8-a ) ENDIF() IF (KOKKOS_ARCH_ARMV81) COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8.1-a ) ENDIF() @@ -175,8 +188,9 @@ ENDIF() IF (KOKKOS_ARCH_ARMV8_THUNDERX) SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8-a -mtune=thunderx ) ENDIF() @@ -184,23 +198,28 @@ ENDIF() IF (KOKKOS_ARCH_ARMV8_THUNDERX2) SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable COMPILER_SPECIFIC_FLAGS( - Cray NO-VALUE-SPECIFIED - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + Cray NO-VALUE-SPECIFIED + NVHPC NO-VALUE-SPECIFIED DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 ) ENDIF() IF (KOKKOS_ARCH_A64FX) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC NO-VALUE-SPECIFIED DEFAULT -march=armv8.2-a+sve - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GCC -march=armv8.2-a+sve -msve-vector-bits=512 + Clang -march=armv8.2-a+sve -msve-vector-bits=512 + GCC -march=armv8.2-a+sve -msve-vector-bits=512 ) ENDIF() IF (KOKKOS_ARCH_ZEN) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx2 + NVHPC -tp=zen DEFAULT -march=znver1 -mtune=znver1 ) 
SET(KOKKOS_ARCH_AMD_ZEN ON) @@ -209,7 +228,9 @@ ENDIF() IF (KOKKOS_ARCH_ZEN2) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx2 + NVHPC -tp=zen2 DEFAULT -march=znver2 -mtune=znver2 ) SET(KOKKOS_ARCH_AMD_ZEN2 ON) @@ -218,7 +239,9 @@ ENDIF() IF (KOKKOS_ARCH_ZEN3) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx2 + NVHPC -tp=zen2 DEFAULT -march=znver3 -mtune=znver3 ) SET(KOKKOS_ARCH_AMD_ZEN3 ON) @@ -227,8 +250,9 @@ ENDIF() IF (KOKKOS_ARCH_WSM) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xSSE4.2 - PGI -tp=nehalem + NVHPC -tp=px Cray NO-VALUE-SPECIFIED DEFAULT -msse4.2 ) @@ -238,8 +262,9 @@ ENDIF() IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) SET(KOKKOS_ARCH_AVX ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -mavx - PGI -tp=sandybridge + NVHPC -tp=sandybridge Cray NO-VALUE-SPECIFIED DEFAULT -mavx ) @@ -248,8 +273,9 @@ ENDIF() IF (KOKKOS_ARCH_HSW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xCORE-AVX2 - PGI -tp=haswell + NVHPC -tp=haswell Cray NO-VALUE-SPECIFIED DEFAULT -march=core-avx2 -mtune=core-avx2 ) @@ -258,8 +284,9 @@ ENDIF() IF (KOKKOS_ARCH_BDW) SET(KOKKOS_ARCH_AVX2 ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xCORE-AVX2 - PGI -tp=haswell + NVHPC -tp=haswell Cray NO-VALUE-SPECIFIED DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm ) @@ -269,8 +296,9 @@ IF (KOKKOS_ARCH_KNL) #avx512-mic SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xMIC-AVX512 - PGI NO-VALUE-SPECIFIED + NVHPC -tp=knl Cray NO-VALUE-SPECIFIED DEFAULT -march=knl -mtune=knl ) @@ -279,6 +307,7 @@ ENDIF() IF (KOKKOS_ARCH_KNC) SET(KOKKOS_USE_ISA_KNC ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mmic ) ENDIF() @@ -287,8 +316,9 @@ IF (KOKKOS_ARCH_SKX) #avx512-xeon SET(KOKKOS_ARCH_AVX512XEON ON) COMPILER_SPECIFIC_FLAGS( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Intel -xCORE-AVX512 - PGI NO-VALUE-SPECIFIED + NVHPC -tp=skylake Cray NO-VALUE-SPECIFIED DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 -mrtm ) @@ -304,7 +334,8 @@ ENDIF() IF (KOKKOS_ARCH_POWER7) COMPILER_SPECIFIC_FLAGS( - PGI NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC NO-VALUE-SPECIFIED DEFAULT -mcpu=power7 -mtune=power7 ) SET(KOKKOS_USE_ISA_POWERPCBE ON) @@ -312,16 +343,16 @@ ENDIF() IF (KOKKOS_ARCH_POWER8) COMPILER_SPECIFIC_FLAGS( - PGI NO-VALUE-SPECIFIED - NVIDIA NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC -tp=pwr8 DEFAULT -mcpu=power8 -mtune=power8 ) ENDIF() IF (KOKKOS_ARCH_POWER9) COMPILER_SPECIFIC_FLAGS( - PGI NO-VALUE-SPECIFIED - NVIDIA NO-VALUE-SPECIFIED + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID + NVHPC -tp=pwr9 DEFAULT -mcpu=power9 -mtune=power9 ) ENDIF() @@ -368,7 +399,7 @@ ENDIF() IF (KOKKOS_ENABLE_SYCL) COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl + DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int ) COMPILER_SPECIFIC_OPTIONS( DEFAULT -fsycl-unnamed-lambda @@ -443,20 +474,58 @@ ENDFUNCTION() CHECK_AMDGPU_ARCH(VEGA900 gfx900) # Radeon Instinct MI25 CHECK_AMDGPU_ARCH(VEGA906 gfx906) # Radeon Instinct MI50 and MI60 CHECK_AMDGPU_ARCH(VEGA908 gfx908) +CHECK_AMDGPU_ARCH(VEGA90A gfx90a) IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. 
" - "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") + IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) + EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + STRING(LENGTH "${GPU_ARCHS}" len_str) + # enumerator always output gfx000 as the first line + IF(${len_str} LESS 8) + MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. " + "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") + ENDIF() + ELSE() + MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture currently enabled. " + "Please enable one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") + ENDIF() +ENDIF() + +MACRO(CHECK_MULTIPLE_INTEL_ARCH) + IF(KOKKOS_ARCH_INTEL_GPU) + MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + ENDIF() + SET(KOKKOS_ARCH_INTEL_GPU ON) +ENDMACRO() + +IF(KOKKOS_ARCH_INTEL_GEN) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_DG1) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_GEN9) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_GEN11) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_GEN12LP) + CHECK_MULTIPLE_INTEL_ARCH() +ENDIF() +IF(KOKKOS_ARCH_INTEL_XEHP) + CHECK_MULTIPLE_INTEL_ARCH() ENDIF() IF (KOKKOS_ENABLE_OPENMPTARGET) SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) IF (CLANG_CUDA_ARCH) - STRING(REPLACE "sm_" "cc" PGI_CUDA_ARCH ${CLANG_CUDA_ARCH}) + STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) COMPILER_SPECIFIC_FLAGS( Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64-nvidia-cuda - XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG} - PGI -gpu=${PGI_CUDA_ARCH} + XL -qtgtarch=${KOKKOS_CUDA_ARCH_FLAG} + NVHPC -gpu=${NVHPC_CUDA_ARCH} ) ENDIF() SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) @@ -465,7 +534,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa ) ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) + IF (KOKKOS_ARCH_INTEL_GPU) COMPILER_SPECIFIC_FLAGS( IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ ) @@ -485,7 +554,27 @@ IF (KOKKOS_ENABLE_SYCL) ENDIF() ELSEIF(KOKKOS_ARCH_INTEL_GEN) COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl" + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9-" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN9) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen9" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN11) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen11" + ) + ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device gen12lp" + ) + ELSEIF(KOKKOS_ARCH_INTEL_DG1) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device dg1" + ) + ELSEIF(KOKKOS_ARCH_INTEL_XEHP) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device xehp" ) ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index 23847263a9..5afed4fb0e 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -137,7 +137,7 
@@ SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 4.0.0 or higher" SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 5.3.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 17.0.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 9.2.88 or higher") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 3.8.0 or higher") +SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC 4.2.0 or higher") SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.4 or higher\n") IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) @@ -158,13 +158,23 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) + IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.2.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 17.4) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() + # Treat PGI internally as NVHPC to simplify handling both compilers. + # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is + # backward-compatible to pgc++. + SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +ENDIF() + +IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) ENDIF() STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index d7f83ddbdf..7fd0794036 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -62,7 +62,7 @@ IF(KOKKOS_ENABLE_OPENMP) COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} IntelLLVM -Xcompiler -fiopenmp - PGI -Xcompiler -mp + NVHPC -Xcompiler -mp Cray NO-VALUE-SPECIFIED XL -Xcompiler -qsmp=omp DEFAULT -Xcompiler -fopenmp @@ -72,7 +72,7 @@ IF(KOKKOS_ENABLE_OPENMP) Clang ${ClangOpenMPFlag} IntelLLVM -fiopenmp AppleClang -Xpreprocessor -fopenmp - PGI -mp + NVHPC -mp Cray NO-VALUE-SPECIFIED XL -qsmp=omp DEFAULT -fopenmp @@ -94,7 +94,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) Clang ${ClangOpenMPFlag} -Wno-openmp-mapping IntelLLVM -fiopenmp -Wno-openmp-mapping XL -qsmp=omp -qoffload -qnoeh - PGI -mp=gpu + NVHPC -mp=gpu DEFAULT -fopenmp ) COMPILER_SPECIFIC_DEFS( diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 95bce66c7b..4cb8bd20f5 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -26,9 +26,16 @@ KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +# Set the Default for Desul Atomics usage. +set(_DESUL_ATOMICS_DEFAULT ON) + KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +# As of 08/12/2021 CudaMallocAsync causes issues if UCX is used as MPI communication layer. 
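+# For illustration (assumed invocation, not introduced by this patch): a build
+# opting in to the option below would configure roughly as
+#   cmake -DKokkos_ENABLE_CUDA=ON -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=ON ..
+# which defines KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC in KokkosCore_config.h.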
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_3 ON "Whether code deprecated in major release 3 is available" ) +KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") KOKKOS_ENABLE_OPTION(HPX_ASYNC_DISPATCH OFF "Whether HPX supports asynchronous dispatch") KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") @@ -50,6 +57,9 @@ KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tu KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler") +# This option will go away eventually, but allows fallback to old implementation when needed. +KOKKOS_ENABLE_OPTION(IMPL_DESUL_ATOMICS ON "Whether to use desul based atomics - option only during beta") + IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}") ENDIF() diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index e1a3e5f8bd..02c9a911b1 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu) + SET(COMPILERS NVIDIA NVHPC XL XLClang DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 707fb000af..1eb0592c7f 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -140,7 +140,7 @@ IF (NOT KOKKOS_CXX_STANDARD_FEATURE) IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) + ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index d8d044c9d7..51bad521c4 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -67,6 +67,12 @@ SET(PTHREAD_DEFAULT OFF) ENDIF() KOKKOS_TPL_OPTION(PTHREAD ${PTHREAD_DEFAULT} TRIBITS Pthread) +IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + SET(LIBQUADMATH_DEFAULT ON) +ELSE() + SET(LIBQUADMATH_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) #Make sure we use our local FindKokkosCuda.cmake KOKKOS_IMPORT_TPL(HPX INTERFACE) @@ -78,6 +84,7 @@ KOKKOS_IMPORT_TPL(LIBDL) KOKKOS_IMPORT_TPL(MEMKIND) KOKKOS_IMPORT_TPL(PTHREAD INTERFACE) KOKKOS_IMPORT_TPL(ROCM INTERFACE) +KOKKOS_IMPORT_TPL(LIBQUADMATH) #Convert list to newlines (which CMake doesn't always like in cache variables) STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") diff --git a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake 
b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake new file mode 100644 index 0000000000..1f7587da80 --- /dev/null +++ b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -0,0 +1,46 @@ +# @HEADER +# ************************************************************************ +# +# Kokkos v. 3.0 +# Copyright (2020) National Technology & Engineering +# Solutions of Sandia, LLC (NTESS). +# +# Under the terms of Contract DE-NA0003525 with NTESS, +# the U.S. Government retains certain rights in this software. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Corporation nor the names of the +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +# +# ************************************************************************ +# @HEADER + +TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath + REQUIRED_HEADERS quadmath.h + REQUIRED_LIBS_NAMES quadmath +) diff --git a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp index 8c507c7662..7ed9a0271a 100644 --- a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp +++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp @@ -48,7 +48,7 @@ #include #include -#include +#include // Compare performance of DynRankView to View, specific focus on the parenthesis // operators diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp index 65de551b27..16b74a4997 100644 --- a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp +++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp @@ -48,7 +48,7 @@ #include #include -#include +#include // This test will simulate global ids diff --git a/lib/kokkos/containers/performance_tests/TestScatterView.hpp b/lib/kokkos/containers/performance_tests/TestScatterView.hpp index 0f3ba103ef..8a23f59d32 100644 --- a/lib/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp @@ -46,7 +46,7 @@ #define KOKKOS_TEST_SCATTER_VIEW_HPP #include -#include +#include namespace Perf { diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp index c31412552a..4547d5c357 100644 --- a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp +++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp @@ -43,7 +43,7 @@ #ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP #define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP -#include +#include #include #include diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index ea1d6dde5d..c5b66f05a3 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -76,20 +76,25 @@ class Bitset { using execution_space = Device; using size_type = unsigned int; - enum { BIT_SCAN_REVERSE = 1u }; - enum { MOVE_HINT_BACKWARD = 2u }; + static constexpr unsigned BIT_SCAN_REVERSE = 1u; + static constexpr unsigned MOVE_HINT_BACKWARD = 2u; - enum { - BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u, - BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE, - BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD, - BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD - }; + static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u; + static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = + BIT_SCAN_REVERSE; + static constexpr unsigned BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = + MOVE_HINT_BACKWARD; + static constexpr unsigned BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = + BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD; private: - enum { block_size = static_cast(sizeof(unsigned) * CHAR_BIT) }; - enum { block_mask = block_size - 1u }; - enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) }; + enum : unsigned { + block_size = static_cast(sizeof(unsigned) * CHAR_BIT) + }; + enum : unsigned { block_mask = block_size - 1u }; + enum : unsigned { + block_shift = Kokkos::Impl::integral_power_of_two(block_size) + }; public: /// 
constructor @@ -317,14 +322,18 @@ class ConstBitset { enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) }; public: + KOKKOS_FUNCTION ConstBitset() : m_size(0) {} + KOKKOS_FUNCTION ConstBitset(Bitset const& rhs) : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {} + KOKKOS_FUNCTION ConstBitset(ConstBitset const& rhs) : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {} + KOKKOS_FUNCTION ConstBitset& operator=(Bitset const& rhs) { this->m_size = rhs.m_size; this->m_blocks = rhs.m_blocks; @@ -332,6 +341,7 @@ class ConstBitset { return *this; } + KOKKOS_FUNCTION ConstBitset& operator=(ConstBitset const& rhs) { this->m_size = rhs.m_size; this->m_blocks = rhs.m_blocks; diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 45710d1f73..f55d0f2b7f 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -597,8 +597,10 @@ class DualView : public ViewTraits { } if (std::is_same::value) { - typename t_dev::execution_space().fence(); - typename t_host::execution_space().fence(); + typename t_dev::execution_space().fence( + "Kokkos::DualView<>::sync: fence after syncing DualView"); + typename t_host::execution_space().fence( + "Kokkos::DualView<>::sync: fence after syncing DualView"); } } @@ -776,10 +778,11 @@ class DualView : public ViewTraits { /// If \c Device is the same as this DualView's device type, then /// mark the device's data as modified. Otherwise, mark the host's /// data as modified. - template + template * = + nullptr> void modify() { if (modified_flags.data() == nullptr) return; - if (impl_dualview_is_single_device::value) return; int dev = get_device_side(); if (dev == 1) { // if Device is the same as DualView's device type @@ -811,8 +814,17 @@ class DualView : public ViewTraits { #endif } + template < + class Device, class Dummy = DualView, + std::enable_if_t* = nullptr> + void modify() { + return; + } + + template * = + nullptr> inline void modify_host() { - if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(0) = (modified_flags(1) > modified_flags(0) ? modified_flags(1) @@ -832,8 +844,17 @@ class DualView : public ViewTraits { } } + template < + class Dummy = DualView, + std::enable_if_t* = nullptr> + inline void modify_host() { + return; + } + + template * = + nullptr> inline void modify_device() { - if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(1) = (modified_flags(1) > modified_flags(0) ? 
modified_flags(1) @@ -853,6 +874,13 @@ class DualView : public ViewTraits { } } + template < + class Dummy = DualView, + std::enable_if_t* = nullptr> + inline void modify_device() { + return; + } + inline void clear_sync_state() { if (modified_flags.data() != nullptr) modified_flags(1) = modified_flags(0) = 0; @@ -875,8 +903,15 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) { - ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7); - h_view = create_mirror_view(d_view); + const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7}; + const bool sizeMismatch = + Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); + + if (sizeMismatch) { + ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7); + h_view = create_mirror_view(d_view); + } else + ::Kokkos::deep_copy(d_view, typename t_dev::value_type{}); /* Reset dirty flags */ if (modified_flags.data() == nullptr) { @@ -897,41 +932,31 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) { + const size_t new_extents[8] = {n0, n1, n2, n3, n4, n5, n6, n7}; + const bool sizeMismatch = + Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); + if (modified_flags.data() == nullptr) { modified_flags = t_modified_flags("DualView::modified_flags"); } if (modified_flags(1) >= modified_flags(0)) { /* Resize on Device */ - ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7); - h_view = create_mirror_view(d_view); - - /* Mark Device copy as modified */ - modified_flags(1) = modified_flags(1) + 1; + if (sizeMismatch) { + ::Kokkos::resize(d_view, n0, n1, n2, n3, n4, n5, n6, n7); + h_view = create_mirror_view(d_view); + /* Mark Device copy as modified */ + modified_flags(1) = modified_flags(1) + 1; + } } else { /* Realloc on Device */ - - ::Kokkos::realloc(d_view, n0, n1, n2, n3, n4, n5, n6, n7); - - const bool sizeMismatch = - (h_view.extent(0) != n0) || (h_view.extent(1) != n1) || - (h_view.extent(2) != n2) || (h_view.extent(3) != n3) || - (h_view.extent(4) != n4) || (h_view.extent(5) != n5) || - (h_view.extent(6) != n6) || (h_view.extent(7) != n7); - if (sizeMismatch) + if (sizeMismatch) { ::Kokkos::resize(h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = create_mirror_view(typename t_dev::execution_space(), h_view); - t_host temp_view = create_mirror_view(d_view); - - /* Remap on Host */ - Kokkos::deep_copy(temp_view, h_view); - - h_view = temp_view; - - d_view = create_mirror_view(typename t_dev::execution_space(), h_view); - - /* Mark Host copy as modified */ - modified_flags(0) = modified_flags(0) + 1; + /* Mark Host copy as modified */ + modified_flags(0) = modified_flags(0) + 1; + } } } diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index c6323fef93..b673c53a4e 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -1140,7 +1140,8 @@ class DynRankView : public ViewTraits { // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::DynRankView<>::DynRankView: fence before UVM allocation"); } #endif //------------------------------------------------------------ @@ -1154,7 +1155,8 @@ class DynRankView : public ViewTraits { #if defined(KOKKOS_ENABLE_CUDA) if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::DynRankView<>::DynRankView: fence after UVM allocation"); } #endif //------------------------------------------------------------ @@ -1404,7 +1406,7 @@ class ViewMapping< template struct apply { - static_assert(Kokkos::Impl::is_memory_traits::value, ""); + static_assert(Kokkos::is_memory_traits::value, ""); using traits_type = Kokkos::ViewTraits& lhs, namespace Kokkos { namespace Impl { -template +template struct DynRankViewFill { using const_value_type = typename OutputView::traits::const_value_type; @@ -1693,9 +1695,11 @@ inline void deep_copy( typename ViewTraits::value_type>::value, "deep_copy requires non-const type"); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(DynRankView, value_type): fence before filling view"); Kokkos::Impl::DynRankViewFill >(dst, value); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(DynRankView, value_type): fence after filling view"); } /** \brief Deep copy into a value in Host memory from a view. */ @@ -1711,10 +1715,13 @@ inline void deep_copy( using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(value_type, DynRankView): fence before copying " + "value"); Kokkos::Impl::DeepCopy(&dst, src.data(), sizeof(ST)); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::deep_copy(value_type, DynRankView): fence after copying value"); } //---------------------------------------------------------------------------- @@ -1744,14 +1751,14 @@ inline void deep_copy( enum { DstExecCanAccessSrc = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; enum { SrcExecCanAccessDst = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; if ((void*)dst.data() != (void*)src.data()) { @@ -1762,10 +1769,14 @@ inline void deep_copy( // memory then can byte-wise copy if (rank(src) == 0 && rank(dst) == 0) { using value_type = typename dst_type::value_type; - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "copying rank-0 views"); Kokkos::Impl::DeepCopy( dst.data(), src.data(), sizeof(value_type)); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "copying rank-0 views"); } else if (std::is_same< typename DstType::traits::value_type, typename SrcType::traits::non_const_value_type>::value && @@ -1787,10 +1798,14 @@ inline void deep_copy( dst.extent(6) == src.extent(6) && dst.extent(7) == src.extent(7)) { const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "copying rank-1 views"); Kokkos::Impl::DeepCopy( dst.data(), src.data(), nbytes); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "copying rank-1 views"); } else if (std::is_same< typename 
DstType::traits::value_type, typename SrcType::traits::non_const_value_type>::value && @@ -1817,29 +1832,43 @@ inline void deep_copy( dst.stride_6() == src.stride_6() && dst.stride_7() == src.stride_7()) { const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span(); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "copying rank-1 views"); Kokkos::Impl::DeepCopy( dst.data(), src.data(), nbytes); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "copying rank-1 views"); } else if (DstExecCanAccessSrc) { // Copying data between views in accessible memory spaces and either // non-contiguous or incompatible shape. - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "remapping views of incompatible shape"); Kokkos::Impl::DynRankViewRemap(dst, src); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "remapping views of incompatible shape"); } else if (SrcExecCanAccessDst) { // Copying data between views in accessible memory spaces and either // non-contiguous or incompatible shape. - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence before " + "remapping views of incompatible shape"); Kokkos::Impl::DynRankViewRemap( dst, src); - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence after " + "remapping views of incompatible shape"); } else { Kokkos::Impl::throw_runtime_exception( "deep_copy given views that would require a temporary allocation"); } } else { - Kokkos::fence(); + Kokkos::fence( + "Kokkos::Impl::DeepCopy(DynRankView, DynRankView): fence due to same " + "src and dst"); } } diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index cc949d4c55..4acae56970 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -53,36 +53,203 @@ namespace Kokkos { namespace Experimental { -// Simple metafunction for choosing memory space -// In the current implementation, if memory_space == CudaSpace, -// use CudaUVMSpace for the chunk 'array' allocation, which -// contains will contain pointers to chunks of memory allocated -// in CudaSpace namespace Impl { -template -struct ChunkArraySpace { - using memory_space = MemSpace; + +/// Utility class to manage memory for chunked arrays on the host and +/// device. Allocates/deallocates memory on both the host and device along with +/// providing utilities for creating mirrors and deep copying between them. 
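+/// In outline (as wired up in DynamicView further below): one manager is held
+/// per memory space, create_mirror() produces the host-side counterpart, and
+/// deep_copy_to() pushes the chunk-pointer table to the device when the two
+/// spaces are not mutually accessible.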
+template +struct ChunkedArrayManager { + using value_type = ValueType; + using pointer_type = ValueType*; + using track_type = Kokkos::Impl::SharedAllocationTracker; + + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; + + template + friend struct ChunkedArrayManager; + + template + inline ChunkedArrayManager(const ChunkedArrayManager& rhs) + : m_valid(rhs.m_valid), + m_chunk_max(rhs.m_chunk_max), + m_chunks((ValueType**)(rhs.m_chunks)), + m_track(rhs.m_track), + m_chunk_size(rhs.m_chunk_size) { + static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable, + "Incompatible ChunkedArrayManager copy construction"); + } + + ChunkedArrayManager(const unsigned arg_chunk_max, + const unsigned arg_chunk_size) + : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {} + + private: + struct ACCESSIBLE_TAG {}; + struct INACCESSIBLE_TAG {}; + + ChunkedArrayManager(ACCESSIBLE_TAG, pointer_type* arg_chunks, + const unsigned arg_chunk_max) + : m_valid(true), m_chunk_max(arg_chunk_max), m_chunks(arg_chunks) {} + + ChunkedArrayManager(INACCESSIBLE_TAG, const unsigned arg_chunk_max, + const unsigned arg_chunk_size) + : m_chunk_max(arg_chunk_max), m_chunk_size(arg_chunk_size) {} + + public: + template + struct IsAccessibleFrom; + + template + struct IsAccessibleFrom< + Space, typename std::enable_if_t::accessible>> : std::true_type {}; + + template + struct IsAccessibleFrom< + Space, typename std::enable_if_t::accessible>> : std::false_type {}; + + template + static ChunkedArrayManager create_mirror( + ChunkedArrayManager const& other, + typename std::enable_if::value>::type* = + nullptr) { + return ChunkedArrayManager{ + ACCESSIBLE_TAG{}, other.m_chunks, other.m_chunk_max}; + } + + template + static ChunkedArrayManager create_mirror( + ChunkedArrayManager const& other, + typename std::enable_if::value>::type* = + nullptr) { + using tag_type = + typename ChunkedArrayManager::INACCESSIBLE_TAG; + return ChunkedArrayManager{tag_type{}, other.m_chunk_max, + other.m_chunk_size}; + } + + public: + void allocate_device(const std::string& label) { + if (m_chunks == nullptr) { + m_chunks = reinterpret_cast(MemorySpace().allocate( + label.c_str(), (sizeof(pointer_type) * (m_chunk_max + 2)))); + } + } + + void initialize() { + for (unsigned i = 0; i < m_chunk_max + 2; i++) { + m_chunks[i] = nullptr; + } + m_valid = true; + } + + private: + /// Custom destroy functor for deallocating array chunks along with a linked + /// allocation + template + struct Destroy { + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; + Destroy& operator=(const Destroy&) = default; + + Destroy(std::string label, value_type** arg_chunk, + const unsigned arg_chunk_max, const unsigned arg_chunk_size, + value_type** arg_linked) + : m_label(label), + m_chunks(arg_chunk), + m_linked(arg_linked), + m_chunk_max(arg_chunk_max), + m_chunk_size(arg_chunk_size) {} + + void execute() { + // Destroy the array of chunk pointers. + // Two entries beyond the max chunks are allocation counters. 
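+      // Layout, restating the invariant documented at allocate_with_destroy()
+      // below: m_chunks[0..len) point at chunk allocations,
+      // *m_chunks[m_chunk_max] is the number of allocated chunks, and
+      // *m_chunks[m_chunk_max+1] is the requested extent.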
+ uintptr_t const len = + *reinterpret_cast(m_chunks + m_chunk_max); + for (unsigned i = 0; i < len; i++) { + Space().deallocate(m_label.c_str(), m_chunks[i], + sizeof(value_type) * m_chunk_size); + } + // Destroy the linked allocation if we have one. + if (m_linked != nullptr) { + Space().deallocate(m_label.c_str(), m_linked, + (sizeof(value_type*) * (m_chunk_max + 2))); + } + } + + void destroy_shared_allocation() { execute(); } + + std::string m_label; + value_type** m_chunks = nullptr; + value_type** m_linked = nullptr; + unsigned m_chunk_max; + unsigned m_chunk_size; + }; + + public: + template + void allocate_with_destroy(const std::string& label, + pointer_type* linked_allocation = nullptr) { + using destroy_type = Destroy; + using record_type = + Kokkos::Impl::SharedAllocationRecord; + + // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] == + // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in + // Destroy's execute(...) method + record_type* const record = record_type::allocate( + MemorySpace(), label, (sizeof(pointer_type) * (m_chunk_max + 2))); + m_chunks = static_cast(record->data()); + m_track.assign_allocated_record_to_uninitialized(record); + + record->m_destroy = destroy_type(label, m_chunks, m_chunk_max, m_chunk_size, + linked_allocation); + } + + pointer_type* get_ptr() const { return m_chunks; } + + template + typename std::enable_if::value>::type deep_copy_to( + ChunkedArrayManager const& other) { + Kokkos::Impl::DeepCopy( + other.m_chunks, m_chunks, sizeof(pointer_type) * (m_chunk_max + 2)); + } + + template + typename std::enable_if::value>::type deep_copy_to( + ChunkedArrayManager const&) { + // no-op + } + + KOKKOS_INLINE_FUNCTION + pointer_type* operator+(int i) const { return m_chunks + i; } + + KOKKOS_INLINE_FUNCTION + pointer_type& operator[](int i) const { return m_chunks[i]; } + + track_type const& track() const { return m_track; } + + KOKKOS_INLINE_FUNCTION + bool valid() const { return m_valid; } + + private: + bool m_valid = false; + unsigned m_chunk_max = 0; + pointer_type* m_chunks = nullptr; + track_type m_track; + unsigned m_chunk_size = 0; }; -#ifdef KOKKOS_ENABLE_CUDA -template <> -struct ChunkArraySpace { - using memory_space = typename Kokkos::CudaUVMSpace; -}; -#endif -#ifdef KOKKOS_ENABLE_HIP -template <> -struct ChunkArraySpace { - using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace; -}; -#endif -#ifdef KOKKOS_ENABLE_SYCL -template <> -struct ChunkArraySpace { - using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif -} // end namespace Impl +} /* end namespace Impl */ /** \brief Dynamic views are restricted to rank-one and no layout. * Resize only occurs on host outside of parallel_regions. 
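For illustration only, a minimal usage sketch (not part of this patch, and assuming an already-configured Kokkos build) of the rank-one, host-resized behavior described above:

    #include <Kokkos_Core.hpp>
    #include <Kokkos_DynamicView.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        // label, minimum chunk size (rounded up to a power of two), max extent
        Kokkos::Experimental::DynamicView<double*> v("v", 1024, 1 << 20);
        v.resize_serial(4096);  // resizing is host-only, outside parallel regions
        Kokkos::parallel_for(
            "Fill", 4096, KOKKOS_LAMBDA(const int i) { v(i) = i; });
        Kokkos::fence();
      }
      Kokkos::finalize();
    }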
@@ -93,6 +260,13 @@ class DynamicView : public Kokkos::ViewTraits { public: using traits = Kokkos::ViewTraits; + using value_type = typename traits::value_type; + using device_space = typename traits::memory_space; + using host_space = + typename Kokkos::Impl::HostMirror::Space::memory_space; + using device_accessor = Impl::ChunkedArrayManager; + using host_accessor = Impl::ChunkedArrayManager; + private: template friend class DynamicView; @@ -108,7 +282,7 @@ class DynamicView : public Kokkos::ViewTraits { "DynamicView only implemented for non-specialized View type"); template ::accessible> + Space, device_space>::accessible> struct verify_space { KOKKOS_FORCEINLINE_FUNCTION static void check() {} }; @@ -123,9 +297,8 @@ class DynamicView : public Kokkos::ViewTraits { }; private: - track_type m_track; - typename traits::value_type** m_chunks = - nullptr; // array of pointers to 'chunks' of memory + device_accessor m_chunks; + host_accessor m_chunks_host; unsigned m_chunk_shift; // ceil(log2(m_chunk_size)) unsigned m_chunk_mask; // m_chunk_size - 1 unsigned m_chunk_max; // number of entries in the chunk array - each pointing @@ -173,7 +346,8 @@ class DynamicView : public Kokkos::ViewTraits { KOKKOS_INLINE_FUNCTION size_t allocation_extent() const noexcept { - uintptr_t n = *reinterpret_cast(m_chunks + m_chunk_max); + uintptr_t n = + *reinterpret_cast(m_chunks_host + m_chunk_max); return (n << m_chunk_shift); } @@ -183,7 +357,7 @@ class DynamicView : public Kokkos::ViewTraits { KOKKOS_INLINE_FUNCTION size_t size() const noexcept { size_t extent_0 = - *reinterpret_cast(m_chunks + m_chunk_max + 1); + *reinterpret_cast(m_chunks_host + m_chunk_max + 1); return extent_0; } @@ -215,10 +389,10 @@ class DynamicView : public Kokkos::ViewTraits { // Allocation tracking properties KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } + int use_count() const { return m_chunks_host.track().use_count(); } inline const std::string label() const { - return m_track.template get_label(); + return m_chunks_host.track().template get_label(); } //---------------------------------------------------------------------- @@ -285,13 +459,7 @@ class DynamicView : public Kokkos::ViewTraits { * up to the maximum number of chunks * */ template - inline typename std::enable_if< - std::is_integral::value && - Kokkos::Impl::MemorySpaceAccess< - Kokkos::HostSpace, - typename Impl::ChunkArraySpace< - typename traits::memory_space>::memory_space>::accessible>::type - resize_serial(IntType const& n) { + inline void resize_serial(IntType const& n) { using local_value_type = typename traits::value_type; using value_pointer_type = local_value_type*; @@ -304,37 +472,40 @@ class DynamicView : public Kokkos::ViewTraits { } // *m_chunks[m_chunk_max] stores the current number of chunks being used - uintptr_t* const pc = reinterpret_cast(m_chunks + m_chunk_max); - std::string _label = - m_track.template get_label(); + uintptr_t* const pc = + reinterpret_cast(m_chunks_host + m_chunk_max); + std::string _label = m_chunks_host.track().template get_label(); + if (*pc < NC) { while (*pc < NC) { - m_chunks[*pc] = reinterpret_cast( - typename traits::memory_space().allocate( + m_chunks_host[*pc] = + reinterpret_cast(device_space().allocate( _label.c_str(), sizeof(local_value_type) << m_chunk_shift)); ++*pc; } } else { while (NC + 1 <= *pc) { --*pc; - typename traits::memory_space().deallocate( - _label.c_str(), m_chunks[*pc], - sizeof(local_value_type) << m_chunk_shift); - m_chunks[*pc] = nullptr; + 
device_space().deallocate(_label.c_str(), m_chunks_host[*pc], + sizeof(local_value_type) << m_chunk_shift); + m_chunks_host[*pc] = nullptr; } } - // *m_chunks[m_chunk_max+1] stores the 'extent' requested by resize + // *m_chunks_host[m_chunk_max+1] stores the 'extent' requested by resize *(pc + 1) = n; + + m_chunks_host.deep_copy_to(m_chunks); } KOKKOS_INLINE_FUNCTION bool is_allocated() const { - if (m_chunks == nullptr) { - return false; - } else { - // *m_chunks[m_chunk_max] stores the current number of chunks being used + if (m_chunks_host.valid()) { + // *m_chunks_host[m_chunk_max] stores the current number of chunks being + // used uintptr_t* const pc = - reinterpret_cast(m_chunks + m_chunk_max); + reinterpret_cast(m_chunks_host + m_chunk_max); return (*(pc + 1) > 0); + } else { + return false; } } @@ -349,8 +520,8 @@ class DynamicView : public Kokkos::ViewTraits { template DynamicView(const DynamicView& rhs) - : m_track(rhs.m_track), - m_chunks((typename traits::value_type**)rhs.m_chunks), + : m_chunks(rhs.m_chunks), + m_chunks_host(rhs.m_chunks_host), m_chunk_shift(rhs.m_chunk_shift), m_chunk_mask(rhs.m_chunk_mask), m_chunk_max(rhs.m_chunk_max), @@ -361,63 +532,6 @@ class DynamicView : public Kokkos::ViewTraits { "Incompatible DynamicView copy construction"); } - //---------------------------------------------------------------------- - - struct Destroy { - using local_value_type = typename traits::value_type; - std::string m_label; - local_value_type** m_chunks; - unsigned m_chunk_max; - bool m_destroy; - unsigned m_chunk_size; - - // Initialize or destroy array of chunk pointers. - // Two entries beyond the max chunks are allocation counters. - inline void operator()(unsigned i) const { - if (m_destroy && i < m_chunk_max && nullptr != m_chunks[i]) { - typename traits::memory_space().deallocate( - m_label.c_str(), m_chunks[i], - sizeof(local_value_type) * m_chunk_size); - } - m_chunks[i] = nullptr; - } - - void execute(bool arg_destroy) { - using Range = Kokkos::RangePolicy; - - m_destroy = arg_destroy; - - Kokkos::Impl::ParallelFor closure( - *this, - Range(0, m_chunk_max + 2)); // Add 2 to 'destroy' extra slots storing - // num_chunks and extent; previously + 1 - - closure.execute(); - - typename traits::execution_space().fence(); - // Impl::ChunkArraySpace< typename traits::memory_space - // >::memory_space::execution_space().fence(); - } - - void construct_shared_allocation() { execute(false); } - - void destroy_shared_allocation() { execute(true); } - - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; - Destroy& operator=(const Destroy&) = default; - - Destroy(std::string label, typename traits::value_type** arg_chunk, - const unsigned arg_chunk_max, const unsigned arg_chunk_size) - : m_label(label), - m_chunks(arg_chunk), - m_chunk_max(arg_chunk_max), - m_destroy(false), - m_chunk_size(arg_chunk_size) {} - }; - /**\brief Allocation constructor * * Memory is allocated in chunks @@ -427,10 +541,7 @@ class DynamicView : public Kokkos::ViewTraits { explicit inline DynamicView(const std::string& arg_label, const unsigned min_chunk_size, const unsigned max_extent) - : m_track(), - m_chunks(nullptr) - // The chunk size is guaranteed to be a power of two - , + : // The chunk size is guaranteed to be a power of two m_chunk_shift(Kokkos::Impl::integral_power_of_two_that_contains( min_chunk_size)) // div ceil(log2(min_chunk_size)) , @@ -440,28 +551,22 @@ class DynamicView : public Kokkos::ViewTraits { 
m_chunk_shift) // max num pointers-to-chunks in array , m_chunk_size(2 << (m_chunk_shift - 1)) { - using chunk_array_memory_space = typename Impl::ChunkArraySpace< - typename traits::memory_space>::memory_space; - // A functor to deallocate all of the chunks upon final destruction - using record_type = - Kokkos::Impl::SharedAllocationRecord; + m_chunks = device_accessor(m_chunk_max, m_chunk_size); - // Allocate chunk pointers and allocation counter - record_type* const record = - record_type::allocate(chunk_array_memory_space(), arg_label, - (sizeof(pointer_type) * (m_chunk_max + 2))); - // Allocate + 2 extra slots so that *m_chunk[m_chunk_max] == - // num_chunks_alloc and *m_chunk[m_chunk_max+1] == extent This must match in - // Destroy's execute(...) method - - m_chunks = reinterpret_cast(record->data()); - - record->m_destroy = Destroy(arg_label, m_chunks, m_chunk_max, m_chunk_size); - - // Initialize to zero - record->m_destroy.construct_shared_allocation(); - - m_track.assign_allocated_record_to_uninitialized(record); + if (device_accessor::template IsAccessibleFrom::value) { + m_chunks.template allocate_with_destroy(arg_label); + m_chunks.initialize(); + m_chunks_host = + device_accessor::template create_mirror(m_chunks); + } else { + m_chunks.allocate_device(arg_label); + m_chunks_host = + device_accessor::template create_mirror(m_chunks); + m_chunks_host.template allocate_with_destroy( + arg_label, m_chunks.get_ptr()); + m_chunks_host.initialize(); + m_chunks_host.deep_copy_to(m_chunks); + } } }; @@ -487,8 +592,8 @@ inline void deep_copy(const View& dst, enum { DstExecCanAccessSrc = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; if (DstExecCanAccessSrc) { @@ -512,8 +617,8 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, enum { DstExecCanAccessSrc = - Kokkos::Impl::SpaceAccessibility::accessible + Kokkos::SpaceAccessibility::accessible }; if (DstExecCanAccessSrc) { diff --git a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp index fbfaed9b1b..18f026dc6f 100644 --- a/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp +++ b/lib/kokkos/containers/src/Kokkos_ErrorReporter.hpp @@ -187,7 +187,8 @@ template void ErrorReporter::resize(const size_t new_size) { m_reports.resize(new_size); m_reporters.resize(new_size); - typename DeviceType::execution_space().fence(); + typename DeviceType::execution_space().fence( + "Kokkos::Experimental::ErrorReporter::resize: fence after resizing"); } } // namespace Experimental diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 0f21a08ba3..57bf745d40 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -116,8 +116,7 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( This check should cover the case of Views that don't have the Unmanaged trait but were initialized by pointer. */ if (tracker.has_record()) { - Kokkos::Impl::operator_bounds_error_on_device( - map, Kokkos::Impl::has_printable_label_typedef()); + Kokkos::Impl::operator_bounds_error_on_device(map); } else { Kokkos::abort("OffsetView bounds error"); } @@ -1244,7 +1243,8 @@ class OffsetView : public ViewTraits { // to avoid incomplete type errors from usng Kokkos::Cuda directly. 
if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::OffsetView::OffsetView(): fence before UVM allocation"); } #endif //------------------------------------------------------------ @@ -1256,7 +1256,8 @@ class OffsetView : public ViewTraits { #if defined(KOKKOS_ENABLE_CUDA) if (std::is_same::value) { - typename traits::device_type::memory_space::execution_space().fence(); + typename traits::device_type::memory_space::execution_space().fence( + "Kokkos::OffsetView::OffsetView(): fence after UVM allocation"); } #endif //------------------------------------------------------------ diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index dcd4cf73e5..79bc43b739 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -834,7 +834,7 @@ class ScatterView::value, "ScatterView contribute destination has different layout"); static_assert( - Kokkos::Impl::SpaceAccessibility< + Kokkos::SpaceAccessibility< execution_space, typename dest_type::memory_space>::accessible, "ScatterView contribute destination memory space not accessible"); if (dest.data() == internal_view.data()) return; @@ -1061,7 +1061,7 @@ class ScatterView::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::SpaceAccessibility< + Kokkos::SpaceAccessibility< execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); bool is_equal = (dest.data() == internal_view.data()); @@ -1290,7 +1290,7 @@ class ScatterView::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::SpaceAccessibility< + Kokkos::SpaceAccessibility< execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); auto extent = internal_view.extent(internal_view_type::rank - 1); diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 81be3ee2d3..cd633e4031 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -405,7 +405,9 @@ class StaticCrsGraph { Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning", Kokkos::RangePolicy(0, numRows()), partitioner); - typename device_type::execution_space().fence(); + typename device_type::execution_space().fence( + "Kokkos::StaticCrsGraph::create_block_partitioning:: fence after " + "partition"); row_block_offsets = block_offsets; } diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index edb0e7261d..a1601eee35 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -345,7 +345,8 @@ class UnorderedMap { const impl_value_type tmp = impl_value_type(); Kokkos::deep_copy(m_values, tmp); } - { Kokkos::deep_copy(m_scalars, 0); } + Kokkos::deep_copy(m_scalars, 0); + m_size = 0; } KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { @@ -393,9 +394,9 @@ class UnorderedMap { /// /// This method has undefined behavior when erasable() is true. 
/// - /// Note that this is not a device function; it cannot be called in + /// Note that this is not a device function; it cannot be called in /// a parallel kernel. The value is not stored as a variable; it - /// must be computed. + /// must be computed. m_size is a mutable cache of that value. size_type size() const { if (capacity() == 0u) return 0u; if (modified()) { @@ -419,9 +420,13 @@ class UnorderedMap { bool begin_erase() { bool result = !erasable(); if (is_insertable_map && result) { - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::begin_erase: fence before setting erasable " + "flag"); set_flag(erasable_idx); - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::begin_erase: fence after setting erasable " + "flag"); } return result; } @@ -429,10 +434,12 @@ class UnorderedMap { bool end_erase() { bool result = erasable(); if (is_insertable_map && result) { - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::end_erase: fence before erasing"); Impl::UnorderedMapErase f(*this); f.apply(); - execution_space().fence(); + execution_space().fence( + "Kokkos::UnorderedMap::end_erase: fence after erasing"); reset_flag(erasable_idx); } return result; diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp index a1fbba6b21..88721bd89e 100644 --- a/lib/kokkos/containers/src/Kokkos_Vector.hpp +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -119,12 +119,14 @@ class vector : public DualView { if (DV::template need_sync()) { set_functor_host f(DV::h_view, val); parallel_for("Kokkos::vector::assign", n, f); - typename DV::t_host::execution_space().fence(); + typename DV::t_host::execution_space().fence( + "Kokkos::vector::assign: fence after assigning values"); DV::template modify(); } else { set_functor f(DV::d_view, val); parallel_for("Kokkos::vector::assign", n, f); - typename DV::t_dev::execution_space().fence(); + typename DV::t_dev::execution_space().fence( + "Kokkos::vector::assign: fence after assigning values"); DV::template modify(); } } diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp index 6047e60f3d..9512f2d4a2 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -57,22 +57,10 @@ namespace Kokkos { namespace Impl { -KOKKOS_FORCEINLINE_FUNCTION -unsigned rotate_left(unsigned i, int r) { - constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); - return r ? ((i << r) | (i >> (size - r))) : i; -} - KOKKOS_FORCEINLINE_FUNCTION unsigned rotate_right(unsigned i, int r) { constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); - // FIXME_SYCL llvm.fshr.i32 missing - // (https://github.com/intel/llvm/issues/3308) -#ifdef __SYCL_DEVICE_ONLY__ - return rotate_left(i, size - r); -#else return r ? 
((i >> r) | (i << (size - r))) : i;
-#endif
 }
 
 template
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
index 367ab33857..fdd78e4e5f 100644
--- a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
+++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@@ -75,7 +75,7 @@ uint32_t fmix32(uint32_t h) {
 KOKKOS_INLINE_FUNCTION
 uint32_t MurmurHash3_x86_32(const void* key, int len, uint32_t seed) {
-  const uint8_t* data = (const uint8_t*)key;
+  const uint8_t* data = static_cast<const uint8_t*>(key);
   const int nblocks = len / 4;
 
   uint32_t h1 = seed;
diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp
index 3eee85ed10..e22564aa5c 100644
--- a/lib/kokkos/containers/unit_tests/TestDualView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp
@@ -49,7 +49,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 
 namespace Test {
diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
index dd0199ed81..a8d62bd24c 100644
--- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp
@@ -702,6 +702,11 @@ class TestDynViewAPI {
 
   using View0 = Kokkos::View;
   using View1 = Kokkos::View;
+  using View2 = Kokkos::View;
+  using View3 = Kokkos::View;
+  using View4 = Kokkos::View;
+  using View5 = Kokkos::View;
+  using View6 = Kokkos::View;
   using View7 = Kokkos::View;
 
   using host_view_space = typename View0::host_mirror_space;
@@ -1065,7 +1070,7 @@ class TestDynViewAPI {
 
     dView0 d_uninitialized(
         Kokkos::view_alloc(Kokkos::WithoutInitializing, "uninit"), 10, 20);
-    ASSERT_TRUE(d_uninitialized.data() != nullptr);
+    ASSERT_NE(d_uninitialized.data(), nullptr);
     ASSERT_EQ(d_uninitialized.rank(), 2);
     ASSERT_EQ(d_uninitialized.extent(0), 10);
     ASSERT_EQ(d_uninitialized.extent(1), 20);
@@ -1075,14 +1080,14 @@ class TestDynViewAPI {
     hView0 hx, hy, hz;
 
     ASSERT_TRUE(Kokkos::is_dyn_rank_view::value);
-    ASSERT_FALSE(Kokkos::is_dyn_rank_view >::value);
+    ASSERT_FALSE(Kokkos::is_dyn_rank_view>::value);
 
-    ASSERT_TRUE(dx.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dy.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(dz.data() == nullptr);  // Okay with UVM
-    ASSERT_TRUE(hx.data() == nullptr);
-    ASSERT_TRUE(hy.data() == nullptr);
-    ASSERT_TRUE(hz.data() == nullptr);
+    ASSERT_EQ(dx.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dy.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(dz.data(), nullptr);  // Okay with UVM
+    ASSERT_EQ(hx.data(), nullptr);
+    ASSERT_EQ(hy.data(), nullptr);
+    ASSERT_EQ(hz.data(), nullptr);
     ASSERT_EQ(dx.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dy.extent(0), 0u);  // Okay with UVM
     ASSERT_EQ(dz.extent(0), 0u);  // Okay with UVM
@@ -1153,11 +1158,11 @@ class TestDynViewAPI {
 
     ASSERT_EQ(dx.use_count(), size_t(2));
 
-    ASSERT_FALSE(dx.data() == nullptr);
-    ASSERT_FALSE(const_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_dx.data() == nullptr);
-    ASSERT_FALSE(unmanaged_from_ptr_dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
+    ASSERT_NE(dx.data(), nullptr);
+    ASSERT_NE(const_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_dx.data(), nullptr);
+    ASSERT_NE(unmanaged_from_ptr_dx.data(), nullptr);
+    ASSERT_NE(dy.data(), nullptr);
     ASSERT_NE(dx, dy);
 
     ASSERT_EQ(dx.extent(0), unsigned(N0));
@@ -1317,17 +1322,17 @@ class TestDynViewAPI {
     ASSERT_NE(dx, dz);
 
     dx = dView0();
-    ASSERT_TRUE(dx.data() == nullptr);
-    ASSERT_FALSE(dy.data() == nullptr);
-
ASSERT_FALSE(dz.data() == nullptr); + ASSERT_EQ(dx.data(), nullptr); + ASSERT_NE(dy.data(), nullptr); + ASSERT_NE(dz.data(), nullptr); dy = dView0(); - ASSERT_TRUE(dx.data() == nullptr); - ASSERT_TRUE(dy.data() == nullptr); - ASSERT_FALSE(dz.data() == nullptr); + ASSERT_EQ(dx.data(), nullptr); + ASSERT_EQ(dy.data(), nullptr); + ASSERT_NE(dz.data(), nullptr); dz = dView0(); - ASSERT_TRUE(dx.data() == nullptr); - ASSERT_TRUE(dy.data() == nullptr); - ASSERT_TRUE(dz.data() == nullptr); + ASSERT_EQ(dx.data(), nullptr); + ASSERT_EQ(dy.data(), nullptr); + ASSERT_EQ(dz.data(), nullptr); // View - DynRankView Interoperability tests // deep_copy from view to dynrankview @@ -1367,7 +1372,7 @@ class TestDynViewAPI { static void check_auto_conversion_to_const( const Kokkos::DynRankView& arg_const, const Kokkos::DynRankView& arg) { - ASSERT_TRUE(arg_const == arg); + ASSERT_EQ(arg_const, arg); } static void run_test_allocated() { @@ -1396,8 +1401,8 @@ class TestDynViewAPI { const_typeX xc = x; const_typeR xr = x; - ASSERT_TRUE(xc == x); - ASSERT_TRUE(x == xc); + ASSERT_EQ(xc, x); + ASSERT_EQ(x, xc); // For CUDA the constant random access View does not return // an lvalue reference due to retrieving through texture cache @@ -1406,7 +1411,7 @@ class TestDynViewAPI { if (!std::is_same::value) #endif { - ASSERT_TRUE(x.data() == xr.data()); + ASSERT_EQ(x.data(), xr.data()); } // typeX xf = xc ; // setting non-const from const must not compile @@ -1659,29 +1664,29 @@ class TestDynViewAPI { const_svector_right_type cvr3 = Kokkos::subdynrankview(mv, Kokkos::ALL(), 2); - ASSERT_TRUE(&v1[0] == &v1(0)); - ASSERT_TRUE(&v1[0] == &mv(0, 0)); - ASSERT_TRUE(&v2[0] == &mv(0, 1)); - ASSERT_TRUE(&v3[0] == &mv(0, 2)); + ASSERT_EQ(&v1[0], &v1(0)); + ASSERT_EQ(&v1[0], &mv(0, 0)); + ASSERT_EQ(&v2[0], &mv(0, 1)); + ASSERT_EQ(&v3[0], &mv(0, 2)); - ASSERT_TRUE(&cv1[0] == &mv(0, 0)); - ASSERT_TRUE(&cv2[0] == &mv(0, 1)); - ASSERT_TRUE(&cv3[0] == &mv(0, 2)); + ASSERT_EQ(&cv1[0], &mv(0, 0)); + ASSERT_EQ(&cv2[0], &mv(0, 1)); + ASSERT_EQ(&cv3[0], &mv(0, 2)); - ASSERT_TRUE(&vr1[0] == &mv(0, 0)); - ASSERT_TRUE(&vr2[0] == &mv(0, 1)); - ASSERT_TRUE(&vr3[0] == &mv(0, 2)); + ASSERT_EQ(&vr1[0], &mv(0, 0)); + ASSERT_EQ(&vr2[0], &mv(0, 1)); + ASSERT_EQ(&vr3[0], &mv(0, 2)); - ASSERT_TRUE(&cvr1[0] == &mv(0, 0)); - ASSERT_TRUE(&cvr2[0] == &mv(0, 1)); - ASSERT_TRUE(&cvr3[0] == &mv(0, 2)); + ASSERT_EQ(&cvr1[0], &mv(0, 0)); + ASSERT_EQ(&cvr2[0], &mv(0, 1)); + ASSERT_EQ(&cvr3[0], &mv(0, 2)); - ASSERT_TRUE(&mv1(0, 0) == &mv(1, 2)); - ASSERT_TRUE(&mv1(1, 1) == &mv(2, 3)); - ASSERT_TRUE(&mv1(3, 2) == &mv(4, 4)); - ASSERT_TRUE(&mvr1(0, 0) == &mv_right(1, 2)); - ASSERT_TRUE(&mvr1(1, 1) == &mv_right(2, 3)); - ASSERT_TRUE(&mvr1(3, 2) == &mv_right(4, 4)); + ASSERT_EQ(&mv1(0, 0), &mv(1, 2)); + ASSERT_EQ(&mv1(1, 1), &mv(2, 3)); + ASSERT_EQ(&mv1(3, 2), &mv(4, 4)); + ASSERT_EQ(&mvr1(0, 0), &mv_right(1, 2)); + ASSERT_EQ(&mvr1(1, 1), &mv_right(2, 3)); + ASSERT_EQ(&mvr1(3, 2), &mv_right(4, 4)); const_svector_type c_cv1(v1); typename svector_type::const_type c_cv2(v2); diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index f018793dd6..023bf92f62 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -52,7 +52,7 @@ #include #include -#include +#include namespace Test { diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index 9ddc226e29..24a43e1ebc 100644 --- 
a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index fdbce2d492..342ce2af48 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -118,11 +118,51 @@ struct test_scatter_view_impl_cls sizes(LENGTH); - size_t total_length = 0; - for (size_t i = 0; i < LENGTH; ++i) { sizes[i] = rand() % 1000; } @@ -189,10 +187,6 @@ void run_test_graph3(size_t B, size_t N) { sizes[1] = N; sizes[1998] = N; - for (size_t i = 0; i < LENGTH; ++i) { - total_length += sizes[i]; - } - int C = 0; dView dx = Kokkos::create_staticcrsgraph("test", sizes); dx.create_block_partitioning(B, C); diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index 4413cfbc80..8009b99656 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -295,10 +295,8 @@ void test_deep_copy(uint32_t num_nodes) { } // FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs -// FIXME_HIP // WORKAROUND MSVC -#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \ - !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) +#if !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) TEST(TEST_CATEGORY, UnorderedMap_insert) { for (int i = 0; i < 500; ++i) { test_insert(100000, 90000, 100, true); @@ -329,6 +327,23 @@ TEST(TEST_CATEGORY, UnorderedMap_valid_empty) { ASSERT_TRUE(n.is_allocated()); } +TEST(TEST_CATEGORY, UnorderedMap_clear_zero_size) { + using Map = + Kokkos::UnorderedMap; + + Map m(11); + ASSERT_EQ(0u, m.size()); + + m.insert(2); + m.insert(3); + m.insert(5); + m.insert(7); + ASSERT_EQ(4u, m.size()); + + m.clear(); + ASSERT_EQ(0u, m.size()); +} + } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in deleted file mode 100644 index f0835772b8..0000000000 --- a/lib/kokkos/core/cmake/KokkosCore_config.h.in +++ /dev/null @@ -1,104 +0,0 @@ -/* The trivial 'src/build_common.sh' creates a config - * that must stay in sync with this file. - */ -#cmakedefine KOKKOS_FOR_SIERRA - -#if !defined(KOKKOS_FOR_SIERRA) - -#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) -#error \ - "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." -#else -#define KOKKOS_CORE_CONFIG_H -#endif - -#cmakedefine KOKKOS_ENABLE_CUDA -#cmakedefine KOKKOS_ENABLE_HIP -#cmakedefine KOKKOS_ENABLE_OPENMP -#cmakedefine KOKKOS_ENABLE_THREADS -#cmakedefine KOKKOS_ENABLE_SERIAL -#cmakedefine KOKKOS_ENABLE_Winthread - -#cmakedefine KOKKOS_ENABLE_HWLOC -#cmakedefine KOKKOS_ENABLE_HBWSPACE -#cmakedefine KOKKOS_ENABLE_LIBRT - -#cmakedefine KOKKOS_ENABLE_DEBUG -#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK -#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK -#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT -#cmakedefine KOKKOS_ENABLE_TUNING - -#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION - -#ifdef KOKKOS_ENABLE_CUDA - -#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC - -// mfh 16 Sep 2014: If passed in on the command line, that overrides -// any value of KOKKOS_USE_CUDA_UVM here. 
Doing this should prevent build -// warnings like this one: -// -// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: -// "KOKKOS_USE_CUDA_UVM" redefined -// -// At some point, we should edit the test-build scripts in -// Trilinos/cmake/ctest/drivers/perseus/, and take -// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there. I -// hesitate to do that now, because I'm not sure if all the files are -// including KokkosCore_config.h (or a header file that includes it) like -// they should. -#ifndef KOKKOS_USE_CUDA_UVM -#cmakedefine KOKKOS_USE_CUDA_UVM -#endif - -#cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - -#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA - -#endif - -#cmakedefine KOKKOS_IMPL_CUDA_CLANG_WORKAROUND - -#ifndef __CUDA_ARCH__ -#cmakedefine KOKKOS_ENABLE_ISA_X86_64 -#cmakedefine KOKKOS_ENABLE_ISA_KNC -#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE -#endif - -#ifdef KOKKOS_ENABLE_HIP -#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE -#endif - -#cmakedefine KOKKOS_ARCH_ARMV80 1 -#cmakedefine KOKKOS_ARCH_ARMV81 1 -#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1 -#cmakedefine KOKKOS_ARCH_AVX 1 -#cmakedefine KOKKOS_ARCH_AVX2 1 -#cmakedefine KOKKOS_ARCH_AVX512MIC 1 -#cmakedefine KOKKOS_ARCH_AVX512XEON 1 -#cmakedefine KOKKOS_ARCH_KNC 1 -#cmakedefine KOKKOS_ARCH_POWER8 1 -#cmakedefine KOKKOS_ARCH_POWER9 1 -#cmakedefine KOKKOS_ARCH_KEPLER 1 -#cmakedefine KOKKOS_ARCH_KEPLER30 1 -#cmakedefine KOKKOS_ARCH_KEPLER32 1 -#cmakedefine KOKKOS_ARCH_KEPLER35 1 -#cmakedefine KOKKOS_ARCH_KEPLER37 1 -#cmakedefine KOKKOS_ARCH_MAXWELL 1 -#cmakedefine KOKKOS_ARCH_MAXWELL50 1 -#cmakedefine KOKKOS_ARCH_MAXWELL52 1 -#cmakedefine KOKKOS_ARCH_MAXWELL53 1 -#cmakedefine KOKKOS_ARCH_PASCAL 1 -#cmakedefine KOKKOS_ARCH_PASCAL60 1 -#cmakedefine KOKKOS_ARCH_PASCAL61 1 -#cmakedefine KOKKOS_ARCH_VOLTA70 1 - -// TODO: These are currently not used in Kokkos. Should they be removed? -#cmakedefine KOKKOS_ENABLE_MPI -#cmakedefine KOKKOS_ENABLE_CUSPARSE - -// TODO: No longer options in Kokkos. Need to be removed. -#cmakedefine KOKKOS_USING_DEPRECATED_VIEW - -#endif // !defined(KOKKOS_FOR_SIERRA) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index 9ff4b6006d..a7c57a9434 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -10,9 +10,7 @@ #INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. 
-IF (KOKKOS_ENABLE_OPENMPTARGET
-    AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI
-         OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
+IF (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)
   RETURN()
 ENDIF()
 
diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
index dee21fd7a5..b534c32c52 100644
--- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp
@@ -231,7 +231,7 @@ void run_test_gramschmidt(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_gramschmidt << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << ", " << avg_seconds << std::endl;
   }
 }
 
diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
index c431c2b0c8..24c1898e0a 100644
--- a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
+++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp
@@ -280,7 +280,7 @@ void run_test_hexgrad(int exp_beg, int exp_end, int num_trials,
 
     std::cout << label_hexgrad << " , " << parallel_work_length << " , "
               << min_seconds << " , " << (min_seconds / parallel_work_length)
-              << std::endl;
+              << ", " << avg_seconds << std::endl;
   }
 }
 
diff --git a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
index 50bbc78a6b..5b7c2a7a03 100644
--- a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
+++ b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp
@@ -205,7 +205,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time RangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -238,7 +238,7 @@ TEST(default_exec, overlap_range_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -280,7 +280,7 @@ TEST(default_exec, overlap_range_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time RangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce, time_overlapped_reduce);
@@ -378,7 +378,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_end = timer.seconds();
 
   if (SpaceInstance::overlap()) {
-    ASSERT_TRUE((time_end > 1.5 * time_overlap));
+    ASSERT_GT(time_end, 1.5 * time_overlap);
   }
   printf("Time MDRangePolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end,
          time_overlap);
@@ -413,7 +413,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   double time_not_fenced = timer.seconds();
   Kokkos::fence();
   if (SpaceInstance::overlap()) {
-    ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced);
+    ASSERT_GT(time_fenced, 2.0 * time_not_fenced);
   }
 
   timer.reset();
@@ -459,7 +459,7 @@ TEST(default_exec, overlap_mdrange_policy) {
   ASSERT_EQ(h_result2(), h_result());
 
   if (SpaceInstance::overlap()) {
-    ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce);
+    ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce);
   }
   printf("Time MDRangePolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n",
          time_no_overlapped_reduce,
time_overlapped_reduce); @@ -548,7 +548,7 @@ TEST(default_exec, overlap_team_policy) { double time_end = timer.seconds(); if (SpaceInstance::overlap()) { - ASSERT_TRUE((time_end > 1.5 * time_overlap)); + ASSERT_GT(time_end, 1.5 * time_overlap); } printf("Time TeamPolicy: NonOverlap: %lf Time Overlap: %lf\n", time_end, time_overlap); @@ -581,7 +581,7 @@ TEST(default_exec, overlap_team_policy) { double time_not_fenced = timer.seconds(); Kokkos::fence(); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_fenced > 2.0 * time_not_fenced); + ASSERT_GT(time_fenced, 2.0 * time_not_fenced); } timer.reset(); Kokkos::parallel_reduce( @@ -622,7 +622,7 @@ TEST(default_exec, overlap_team_policy) { ASSERT_EQ(h_result2(), h_result()); if (SpaceInstance::overlap()) { - ASSERT_TRUE(time_overlapped_reduce < 1.5 * time_no_overlapped_reduce); + ASSERT_LT(time_overlapped_reduce, 1.5 * time_no_overlapped_reduce); } printf("Time TeamPolicy Reduce: NonOverlap: %lf Time Overlap: %lf\n", time_no_overlapped_reduce, time_overlapped_reduce); diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp index 550316bec9..555a05ea27 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewAllocate.cpp @@ -120,7 +120,8 @@ void run_allocateview_tests(int N, int R) { { Kokkos::Timer timer; for (int r = 0; r < R; r++) { - double* a_ptr = (double*)Kokkos::kokkos_malloc("A", sizeof(double) * N8); + double* a_ptr = + static_cast(Kokkos::kokkos_malloc("A", sizeof(double) * N8)); Kokkos::parallel_for( N8, KOKKOS_LAMBDA(const int& i) { a_ptr[i] = 0.0; }); Kokkos::fence(); diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp index afeeb64356..b0562f2fd1 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewResize_8.cpp @@ -47,10 +47,18 @@ namespace Test { TEST(default_exec, ViewResize_Rank8) { +// FIXME_SYCL Avoid running out of resources on the CUDA GPU used in the CI +#ifdef KOKKOS_ENABLE_SYCL + printf("Resize View Performance for LayoutLeft:\n"); + run_resizeview_tests8(9, 1); + printf("Resize View Performance for LayoutRight:\n"); + run_resizeview_tests8(9, 1); +#else printf("Resize View Performance for LayoutLeft:\n"); run_resizeview_tests8(10, 1); printf("Resize View Performance for LayoutRight:\n"); run_resizeview_tests8(10, 1); +#endif } } // namespace Test diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp index 59820f3bdd..54824e5b39 100644 --- a/lib/kokkos/core/perf_test/test_atomic.cpp +++ b/lib/kokkos/core/perf_test/test_atomic.cpp @@ -47,7 +47,7 @@ #include #include -#include +#include using exec_space = Kokkos::DefaultExecutionSpace; @@ -401,7 +401,7 @@ template void Loop(int loop, int test, const char* type_name) { LoopVariant(loop, test); - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; T res = LoopVariant(loop, test); double time = timer.seconds(); diff --git a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp index eec1c8eacc..4086ef5816 100644 --- a/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp +++ b/lib/kokkos/core/perf_test/test_atomic_minmax_simple.cpp @@ -12,13 +12,13 @@ #include #include -#include +#include using exec_space = Kokkos::DefaultExecutionSpace; template void test(const int length) { - Kokkos::Impl::Timer timer; + Kokkos::Timer timer; using 
vector = Kokkos::View;
diff --git a/lib/kokkos/core/perf_test/test_mempool.cpp b/lib/kokkos/core/perf_test/test_mempool.cpp
index 9aab119774..7887d4ba55 100644
--- a/lib/kokkos/core/perf_test/test_mempool.cpp
+++ b/lib/kokkos/core/perf_test/test_mempool.cpp
@@ -48,7 +48,7 @@
 #include
 #include
 
-#include
+#include
 
 using ExecSpace   = Kokkos::DefaultExecutionSpace;
 using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space;
@@ -100,7 +100,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+      ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
       if (ptrs(j)) ++update;
     }
@@ -129,7 +129,7 @@ struct TestFunctor {
 
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      pool.deallocate((void*)ptrs(j), size_alloc);
+      pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
     }
   }
 
@@ -153,9 +153,9 @@ struct TestFunctor {
     for (unsigned k = 0; k < repeat_inner; ++k) {
       const unsigned size_alloc = chunk * (1 + (j % chunk_span));
 
-      pool.deallocate((void*)ptrs(j), size_alloc);
+      pool.deallocate(reinterpret_cast<void*>(ptrs(j)), size_alloc);
 
-      ptrs(j) = (uintptr_t)pool.allocate(size_alloc);
+      ptrs(j) = reinterpret_cast<uintptr_t>(pool.allocate(size_alloc));
 
       if (0 == ptrs(j)) update++;
     }
@@ -266,7 +266,7 @@ int main(int argc, char* argv[]) {
     TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc,
                         fill_stride, chunk_span, repeat_inner);
 
-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
 
     if (!functor.test_fill()) {
      Kokkos::abort("fill ");
diff --git a/lib/kokkos/core/perf_test/test_taskdag.cpp b/lib/kokkos/core/perf_test/test_taskdag.cpp
index b2f936a955..49957ae932 100644
--- a/lib/kokkos/core/perf_test/test_taskdag.cpp
+++ b/lib/kokkos/core/perf_test/test_taskdag.cpp
@@ -56,7 +56,7 @@ int main() { return 0; }
 
 #include
 #include
 
-#include
+#include
 
 using ExecSpace = Kokkos::DefaultExecutionSpace;
 
@@ -220,7 +220,7 @@ int main(int argc, char* argv[]) {
     double time_sum = 0;
 
     for (int i = 0; i < test_repeat_outer; ++i) {
-      Kokkos::Impl::Timer timer;
+      Kokkos::Timer timer;
 
       Functor::FutureType ftmp =
           Kokkos::host_spawn(Kokkos::TaskSingle(sched), Functor(fib_input));
diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt
index 2ab0989805..499736c60d 100644
--- a/lib/kokkos/core/src/CMakeLists.txt
+++ b/lib/kokkos/core/src/CMakeLists.txt
@@ -9,6 +9,8 @@
 INSTALL (DIRECTORY
   "${CMAKE_CURRENT_SOURCE_DIR}/"
   DESTINATION ${KOKKOS_HEADER_DIR}
   FILES_MATCHING
+  PATTERN "*.inc"
+  PATTERN "*.inc_*"
   PATTERN "*.hpp"
   PATTERN "*.h"
 )
@@ -65,6 +67,15 @@
 IF (KOKKOS_ENABLE_SYCL)
   APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp)
 ENDIF()
 
+IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS)
+  APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/desul/src/*.cpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.hpp)
+  APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/desul/*/*/*.inc)
+ENDIF()
+
+
 KOKKOS_ADD_LIBRARY(
   kokkoscore
   SOURCES ${KOKKOS_CORE_SRCS}
@@ -86,3 +97,15 @@
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD)
 KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM)
+
+# FIXME: We need a proper solution to figure out whether to enable
+# libatomic
+# XL requires libatomic even for 64 bit CAS, most others only for 128
+# I (CT) had removed 128bit CAS from desul
to not need libatomic. +IF (KOKKOS_ENABLE_IMPL_DESUL_ATOMICS AND + (KOKKOS_ENABLE_OPENMPTARGET OR (CMAKE_CXX_COMPILER_ID STREQUAL XLClang))) + target_link_libraries(kokkoscore PUBLIC atomic) +ENDIF() + + +KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 916f109758..f6b2762403 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -90,43 +90,25 @@ static std::atomic num_uvm_allocations(0); } // namespace -DeepCopy::DeepCopy(void *dst, const void *src, - size_t n) { - CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); +void DeepCopyCuda(void *dst, const void *src, size_t n) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); } -DeepCopy::DeepCopy(void *dst, const void *src, - size_t n) { - CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); -} - -DeepCopy::DeepCopy(void *dst, const void *src, - size_t n) { - CUDA_SAFE_CALL(cudaMemcpy(dst, src, n, cudaMemcpyDefault)); -} - -DeepCopy::DeepCopy(const Cuda &instance, void *dst, - const void *src, size_t n) { - CUDA_SAFE_CALL( - cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); -} - -DeepCopy::DeepCopy(const Cuda &instance, void *dst, - const void *src, size_t n) { - CUDA_SAFE_CALL( - cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); -} - -DeepCopy::DeepCopy(const Cuda &instance, void *dst, - const void *src, size_t n) { - CUDA_SAFE_CALL( +void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src, + size_t n) { + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, instance.cuda_stream())); } void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { cudaStream_t s = cuda_get_deep_copy_stream(); - CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s)); - cudaStreamSynchronize(s); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s)); + Impl::cuda_stream_synchronize( + s, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + DeepCopyResourceSynchronization, + "Kokkos::Impl::DeepCopyAsyncCuda: Deep Copy Stream Sync"); } } // namespace Impl @@ -137,6 +119,7 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { namespace Kokkos { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 KOKKOS_DEPRECATED void CudaSpace::access_error() { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " @@ -150,6 +133,7 @@ KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) { "non-Cuda space"); Kokkos::Impl::throw_runtime_exception(msg); } +#endif /*--------------------------------------------------------------------------*/ @@ -164,9 +148,11 @@ bool CudaUVMSpace::available() { /*--------------------------------------------------------------------------*/ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 int CudaUVMSpace::number_of_allocations() { return Kokkos::Impl::num_uvm_allocations.load(); } +#endif #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST // The purpose of the following variable is to allow a state-based choice // for pinning UVM allocations to the CPU. 
For now this is considered @@ -204,6 +190,8 @@ CudaUVMSpace::CudaUVMSpace() : m_device(Kokkos::Cuda().cuda_device()) {} CudaHostPinnedSpace::CudaHostPinnedSpace() {} +int memory_threshold_g = 40000; // 40 kB + //============================================================================== // {{{1 @@ -221,7 +209,19 @@ void *CudaSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; +#ifndef CUDART_VERSION +#error CUDART_VERSION undefined! +#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) + cudaError_t error_code; + if (arg_alloc_size >= memory_threshold_g) { + error_code = cudaMallocAsync(&ptr, arg_alloc_size, 0); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } else { + error_code = cudaMalloc(&ptr, arg_alloc_size); + } +#else auto error_code = cudaMalloc(&ptr, arg_alloc_size); +#endif if (error_code != cudaSuccess) { // TODO tag as unlikely branch cudaGetLastError(); // This is the only way to clear the last error, which // we should do here since we're turning it into an @@ -253,7 +253,8 @@ void *CudaUVMSpace::impl_allocate( const Kokkos::Tools::SpaceHandle arg_handle) const { void *ptr = nullptr; - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_allocate: Pre UVM Allocation"); if (arg_alloc_size > 0) { Kokkos::Impl::num_uvm_allocations++; @@ -276,7 +277,8 @@ void *CudaUVMSpace::impl_allocate( CudaMallocManaged); } } - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_allocate: Post UVM Allocation"); if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; @@ -337,9 +339,20 @@ void CudaSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - try { - CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); +#ifndef CUDART_VERSION +#error CUDART_VERSION undefined! +#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) + if (arg_alloc_size >= memory_threshold_g) { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeAsync(arg_alloc_ptr, 0)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize()); + } else { + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); + } +#else + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); +#endif } catch (...) { } } @@ -362,7 +375,8 @@ void CudaUVMSpace::impl_deallocate( , const size_t arg_logical_size, const Kokkos::Tools::SpaceHandle arg_handle) const { - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_deallocate: Pre UVM Deallocation"); if (Kokkos::Profiling::profileLibraryLoaded()) { const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; @@ -372,11 +386,12 @@ void CudaUVMSpace::impl_deallocate( try { if (arg_alloc_ptr != nullptr) { Kokkos::Impl::num_uvm_allocations--; - CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); } } catch (...) { } - Cuda::impl_static_fence(); + Cuda::impl_static_fence( + "Kokkos::CudaUVMSpace::impl_deallocate: Post UVM Deallocation"); } void CudaHostPinnedSpace::deallocate(void *const arg_alloc_ptr, @@ -401,7 +416,7 @@ void CudaHostPinnedSpace::impl_deallocate( reported_size); } try { - CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(arg_alloc_ptr)); } catch (...) 
{ } } @@ -462,7 +477,7 @@ SharedAllocationRecord::attach_texture_object( resDesc.res.linear.sizeInBytes = alloc_size; resDesc.res.linear.devPtr = alloc_ptr; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaCreateTextureObject(&tex_obj, &resDesc, &texDesc, nullptr)); return tex_obj; @@ -581,7 +596,7 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; cudaPointerAttributes attr; - CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaPointerGetAttributes(&attr, ptr)); // I measured this and it turns out prefetching towards the host slows // DualView syncs down. Probably because the latency is not too bad in the // first place for the pull down. If we want to change that provde @@ -593,8 +608,8 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #endif if (to_device && is_managed && space.cuda_device_prop().concurrentManagedAccess) { - CUDA_SAFE_CALL(cudaMemPrefetchAsync(ptr, bytes, space.cuda_device(), - space.cuda_stream())); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemPrefetchAsync( + ptr, bytes, space.cuda_device(), space.cuda_stream())); } } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 0f4259072d..993c8d1bba 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -134,7 +134,12 @@ inline int cuda_deduce_block_size(bool early_termination, } if (blocks_per_sm >= min_blocks_per_sm) { - if (threads_per_sm >= opt_threads_per_sm) { + // The logic prefers smaller block sizes over larger ones to + // give more flexibility to the scheduler. + // But don't go below 128 where performance suffers significantly + // for simple copy/set kernels. 
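The comment above describes the tie-breaking rule used by the new condition that follows: strictly higher occupancy (threads_per_sm) always wins, and among candidates with equal occupancy the smaller block size is preferred as long as it stays at or above 128 threads. A standalone sketch of that rule over a hypothetical candidate list ordered from largest to smallest block size (illustrative only, not the actual Kokkos deduction loop):

    #include <utility>
    #include <vector>

    // Candidates are (block_size, threads_per_sm) pairs, assumed ordered
    // from the largest to the smallest block size.
    int choose_block_size(const std::vector<std::pair<int, int>>& candidates) {
      int opt_block_size     = 0;
      int opt_threads_per_sm = 0;
      for (const auto& [block_size, threads_per_sm] : candidates) {
        // Strictly better occupancy always wins; equal occupancy only moves
        // the choice to a smaller block size while it stays >= 128.
        if ((threads_per_sm > opt_threads_per_sm) ||
            ((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) {
          opt_block_size     = block_size;
          opt_threads_per_sm = threads_per_sm;
        }
      }
      return opt_block_size;
    }

With the old `>=` comparison, the descending scan would keep sliding to ever smaller block sizes on occupancy ties; the added `block_size >= 128` guard stops that slide before block sizes where simple copy/set kernels lose performance.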
+ if ((threads_per_sm > opt_threads_per_sm) || + ((block_size >= 128) && (threads_per_sm == opt_threads_per_sm))) { opt_block_size = block_size; opt_threads_per_sm = threads_per_sm; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp index 4759001d81..36df0d2564 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp @@ -49,13 +49,19 @@ #ifdef KOKKOS_ENABLE_CUDA #include - +#include #include namespace Kokkos { namespace Impl { -void cuda_device_synchronize(); +void cuda_stream_synchronize( + const cudaStream_t stream, + Kokkos::Tools::Experimental::SpecialSynchronizationCases reason, + const std::string& name); +void cuda_device_synchronize(const std::string& name); +void cuda_stream_synchronize(const cudaStream_t stream, + const std::string& name); void cuda_internal_error_throw(cudaError e, const char* name, const char* file = nullptr, const int line = 0); @@ -68,9 +74,24 @@ inline void cuda_internal_safe_call(cudaError e, const char* name, } } -#define CUDA_SAFE_CALL(call) \ +#define KOKKOS_IMPL_CUDA_SAFE_CALL(call) \ Kokkos::Impl::cuda_internal_safe_call(call, #call, __FILE__, __LINE__) +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 + +KOKKOS_DEPRECATED +inline void cuda_internal_safe_call_deprecated(cudaError e, const char* name, + const char* file = nullptr, + const int line = 0) { + cuda_internal_safe_call(e, name, file, line); +} + +#define CUDA_SAFE_CALL(call) \ + Kokkos::Impl::cuda_internal_safe_call_deprecated(call, #call, __FILE__, \ + __LINE__) + +#endif + } // namespace Impl namespace Experimental { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 3de7a69916..bd514f5e88 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -60,6 +60,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -82,8 +83,8 @@ struct GraphImpl { constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; - CUDA_SAFE_CALL(cudaGraphInstantiate(&m_graph_exec, m_graph, &error_node, - error_log, error_log_size)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphInstantiate( + &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); // TODO @graphs print out errors } @@ -107,26 +108,27 @@ struct GraphImpl { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - m_execution_space.fence(); + m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(bool(m_graph)) if (bool(m_graph_exec)) { - CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphExecDestroy(m_graph_exec)); } - CUDA_SAFE_CALL(cudaGraphDestroy(m_graph)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphDestroy(m_graph)); }; explicit GraphImpl(Kokkos::Cuda arg_instance) : m_execution_space(std::move(arg_instance)) { - CUDA_SAFE_CALL(cudaGraphCreate(&m_graph, cuda_graph_flags_t{0})); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphCreate(&m_graph, cuda_graph_flags_t{0})); } void add_node(std::shared_ptr const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node - CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), - m_graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + 
KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphAddEmptyNode(&(arg_node_ptr->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); } template @@ -171,7 +173,7 @@ struct GraphImpl { auto /*const*/& cuda_node = arg_node_ptr->node_details_t::node; KOKKOS_EXPECTS(bool(cuda_node)) - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaGraphAddDependencies(m_graph, &pred_cuda_node, &cuda_node, 1)); } @@ -179,7 +181,7 @@ struct GraphImpl { if (!bool(m_graph_exec)) { _instantiate_graph(); } - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaGraphLaunch(m_graph_exec, m_execution_space.cuda_stream())); } @@ -192,9 +194,10 @@ struct GraphImpl { KOKKOS_EXPECTS(!bool(m_graph_exec)) auto rv = std::make_shared( get_execution_space(), _graph_node_is_root_ctor_tag{}); - CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaGraphAddEmptyNode(&(rv->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); KOKKOS_ENSURES(bool(rv->node_details_t::node)) return rv; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp index ec9c434fe6..c81286eb10 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -51,6 +51,9 @@ !(defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL50) || \ defined(KOKKOS_ARCH_MAXWELL52)) #include +#include // istream & ostream for extraction and insertion ops +#include +#include // reduction_identity #ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED // Make sure no one else tries to define half_t @@ -127,7 +130,7 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> cast_from_half(half_t); -class half_t { +class alignas(2) half_t { public: using impl_type = Kokkos::Impl::half_impl_t::type; @@ -138,6 +141,22 @@ class half_t { KOKKOS_FUNCTION half_t() : val(0.0F) {} + // Copy constructors + KOKKOS_DEFAULTED_FUNCTION + half_t(const half_t&) noexcept = default; + + KOKKOS_INLINE_FUNCTION + half_t(const volatile half_t& rhs) { +#ifdef __CUDA_ARCH__ + val = rhs.val; +#else + const volatile uint16_t* rv_ptr = + reinterpret_cast(&rhs.val); + const uint16_t rv_val = *rv_ptr; + val = reinterpret_cast(rv_val); +#endif // __CUDA_ARCH__ + } + // Don't support implicit conversion back to impl_type. // impl_type is a storage only type on host. 
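The volatile copy constructor added above cannot simply assign on the host path, because the storage-only host half type provides no volatile member operations; instead the two raw bytes are loaded through a volatile uint16_t and reinterpreted into the destination. The same idiom in isolation, with a generic alignas(2) storage type standing in for the host half representation (a sketch, not the patch itself):

    #include <cstdint>
    #include <cstring>

    struct alignas(2) Storage {  // stands in for the storage-only host half type
      unsigned char bytes[2];
    };

    // Volatile-safe copy: read the raw 16 bits through a volatile uint16_t*,
    // as the half_t copy constructor does, then bit-copy into the result.
    inline Storage load_from_volatile(const volatile Storage& src) {
      const volatile std::uint16_t* raw_ptr =
          reinterpret_cast<const volatile std::uint16_t*>(&src);
      const std::uint16_t raw = *raw_ptr;
      Storage out;
      std::memcpy(&out, &raw, sizeof raw);
      return out;
    }

The alignas(2) on the class (also added in this hunk) keeps the 16-bit volatile load naturally aligned.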
KOKKOS_FUNCTION @@ -219,7 +238,7 @@ class half_t { #ifdef __CUDA_ARCH__ tmp.val = +tmp.val; #else - tmp.val = __float2half(+__half2float(tmp.val)); + tmp.val = __float2half(+__half2float(tmp.val)); #endif return tmp; } @@ -230,7 +249,7 @@ class half_t { #ifdef __CUDA_ARCH__ tmp.val = -tmp.val; #else - tmp.val = __float2half(-__half2float(tmp.val)); + tmp.val = __float2half(-__half2float(tmp.val)); #endif return tmp; } @@ -241,7 +260,7 @@ class half_t { #ifdef __CUDA_ARCH__ ++val; #else - float tmp = __half2float(val); + float tmp = __half2float(val); ++tmp; val = __float2half(tmp); #endif @@ -255,7 +274,7 @@ class half_t { #else float tmp = __half2float(val); --tmp; - val = __float2half(tmp); + val = __float2half(tmp); #endif return *this; } @@ -290,7 +309,10 @@ class half_t { template KOKKOS_FUNCTION void operator=(T rhs) volatile { - val = cast_to_half(rhs).val; + impl_type new_val = cast_to_half(rhs).val; + volatile uint16_t* val_ptr = + reinterpret_cast(const_cast(&val)); + *val_ptr = reinterpret_cast(new_val); } // Compound operators @@ -299,30 +321,21 @@ class half_t { #ifdef __CUDA_ARCH__ val += rhs.val; #else - val = __float2half(__half2float(val) + __half2float(rhs.val)); + val = __float2half(__half2float(val) + __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator+=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. Cast away volatile-ness of val for arithmetic but not for store - // location. - val = const_cast(val) + rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ā€˜volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) + - __half2float(rhs.val)); -#endif - return *this; + void operator+=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs += tmp_rhs; + *this = tmp_lhs; } - // Compund operators: upcast overloads for += + // Compound operators: upcast overloads for += template KOKKOS_FUNCTION std::enable_if_t< std::is_same::value || std::is_same::value, T> friend @@ -350,27 +363,18 @@ class half_t { #ifdef __CUDA_ARCH__ val -= rhs.val; #else - val = __float2half(__half2float(val) - __half2float(rhs.val)); + val = __float2half(__half2float(val) - __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator-=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. Cast away volatile-ness of val for arithmetic but not for store - // location. 
- val = const_cast(val) - rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ā€˜volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) - - __half2float(rhs.val)); -#endif - return *this; + void operator-=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs -= tmp_rhs; + *this = tmp_lhs; } // Compund operators: upcast overloads for -= @@ -401,27 +405,18 @@ class half_t { #ifdef __CUDA_ARCH__ val *= rhs.val; #else - val = __float2half(__half2float(val) * __half2float(rhs.val)); + val = __float2half(__half2float(val) * __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator*=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. Cast away volatile-ness of val for arithmetic but not for store - // location. - val = const_cast(val) * rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ā€˜volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) * - __half2float(rhs.val)); -#endif - return *this; + void operator*=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs *= tmp_rhs; + *this = tmp_lhs; } // Compund operators: upcast overloads for *= @@ -452,27 +447,18 @@ class half_t { #ifdef __CUDA_ARCH__ val /= rhs.val; #else - val = __float2half(__half2float(val) / __half2float(rhs.val)); + val = __float2half(__half2float(val) / __half2float(rhs.val)); #endif return *this; } KOKKOS_FUNCTION - volatile half_t& operator/=(half_t rhs) volatile { -#ifdef __CUDA_ARCH__ - // Cuda 10 supports __half volatile stores but not volatile arithmetic - // operands. Cast away volatile-ness of val for arithmetic but not for store - // location. 
- val = const_cast(val) / rhs.val; -#else - // Use non-volatile val_ref to suppress: - // "warning: implicit dereference will not access object of type ā€˜volatile - // __half’ in statement" - auto val_ref = const_cast(val); - val_ref = __float2half(__half2float(const_cast(val)) / - __half2float(rhs.val)); -#endif - return *this; + void operator/=(const volatile half_t& rhs) volatile { + half_t tmp_rhs = rhs; + half_t tmp_lhs = *this; + + tmp_lhs /= tmp_rhs; + *this = tmp_lhs; } // Compund operators: upcast overloads for /= @@ -504,7 +490,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val += rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); #endif return lhs; } @@ -529,7 +515,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val -= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); #endif return lhs; } @@ -554,7 +540,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val *= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); #endif return lhs; } @@ -579,7 +565,7 @@ class half_t { #ifdef __CUDA_ARCH__ lhs.val /= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); #endif return lhs; } @@ -683,6 +669,62 @@ class half_t { return __half2float(val) >= __half2float(rhs.val); #endif } + + KOKKOS_FUNCTION + friend bool operator==(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs == tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator!=(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs != tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator<(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs < tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator>(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs > tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator<=(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs <= tmp_rhs; + } + + KOKKOS_FUNCTION + friend bool operator>=(const volatile half_t& lhs, + const volatile half_t& rhs) { + half_t tmp_lhs = lhs, tmp_rhs = rhs; + return tmp_lhs >= tmp_rhs; + } + + // Insertion and extraction operators + friend std::ostream& operator<<(std::ostream& os, const half_t& x) { + const std::string out = std::to_string(static_cast(x)); + os << out; + return os; + } + + friend std::istream& operator>>(std::istream& is, half_t& x) { + std::string in; + is >> in; + x = std::stod(in); + return is; + } }; // CUDA before 11.1 only has the half <-> float conversions marked host device @@ -943,6 +985,25 @@ KOKKOS_INLINE_FUNCTION } #endif } // namespace Experimental + +// use float as the return type for sum and prod since cuda_fp16.h +// has no constexpr functions for casting to __half +template <> +struct reduction_identity { + KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() noexcept { + return 0.0F; + } + KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() noexcept { + return 1.0F; + } + KOKKOS_FORCEINLINE_FUNCTION 
constexpr static float max() noexcept {
+    return -65504.0F;
+  }
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() noexcept {
+    return 65504.0F;
+  }
+};
+
 }  // namespace Kokkos
 #endif  // KOKKOS_IMPL_HALF_TYPE_DEFINED
 #endif  // KOKKOS_ENABLE_CUDA
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
index 016cb6cdcb..6964d5b41b 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -119,7 +119,7 @@ int cuda_kernel_arch() {
   int arch    = 0;
   int *d_arch = nullptr;
 
-  cudaMalloc((void **)&d_arch, sizeof(int));
+  cudaMalloc(reinterpret_cast<void **>(&d_arch), sizeof(int));
   cudaMemcpy(d_arch, &arch, sizeof(int), cudaMemcpyDefault);
 
   query_cuda_kernel_arch<<<1, 1>>>(d_arch);
@@ -141,7 +141,36 @@ bool cuda_launch_blocking() {
 
 }  // namespace
 
-void cuda_device_synchronize() { CUDA_SAFE_CALL(cudaDeviceSynchronize()); }
+void cuda_device_synchronize(const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event(
+      name,
+      Kokkos::Tools::Experimental::SpecialSynchronizationCases::
+          GlobalDeviceSynchronization,
+      []() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaDeviceSynchronize());
+      });
+}
+
+void cuda_stream_synchronize(const cudaStream_t stream, const CudaInternal *ptr,
+                             const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event(
+      name,
+      Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
+          ptr->impl_get_instance_id()},
+      [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
+
+void cuda_stream_synchronize(
+    const cudaStream_t stream,
+    Kokkos::Tools::Experimental::SpecialSynchronizationCases reason,
+    const std::string &name) {
+  Kokkos::Tools::Experimental::Impl::profile_fence_event(
+      name, reason, [&]() {  // TODO: correct device ID
+        KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamSynchronize(stream));
+      });
+}
 
 void cuda_internal_error_throw(cudaError e, const char *name, const char *file,
                                const int line) {
@@ -221,7 +250,7 @@ CudaInternalDevices::CudaInternalDevices() {
   // See 'cudaSetDeviceFlags' for host-device thread interaction
   // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
 
-  CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&m_cudaDevCount));
 
   if (m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
     Kokkos::abort(
         "Sorry, you have more GPUs per node than we thought anybody would "
         "ever have.
Please report this to github.com/kokkos/kokkos."); } for (int i = 0; i < m_cudaDevCount; ++i) { - CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(m_cudaProp + i, i)); } } @@ -277,25 +306,27 @@ CudaInternal::~CudaInternal() { << std::endl; } - m_cudaDev = -1; - m_cudaArch = -1; - m_multiProcCount = 0; - m_maxWarpCount = 0; - m_maxBlock = 0; - m_maxSharedWords = 0; - m_maxConcurrency = 0; - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchUnifiedCount = 0; - m_scratchUnifiedSupported = 0; - m_streamCount = 0; - m_scratchSpace = nullptr; - m_scratchFlags = nullptr; - m_scratchUnified = nullptr; - m_scratchConcurrentBitset = nullptr; - m_stream = nullptr; - m_team_scratch_current_size = 0; - m_team_scratch_ptr = nullptr; + m_cudaDev = -1; + m_cudaArch = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_maxConcurrency = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchUnifiedCount = 0; + m_scratchUnifiedSupported = 0; + m_streamCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchUnified = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + for (int i = 0; i < m_n_team_scratch; ++i) { + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + } } int CudaInternal::verify_is_initialized(const char *const label) const { @@ -305,16 +336,20 @@ int CudaInternal::verify_is_initialized(const char *const label) const { } return 0 <= m_cudaDev; } - +uint32_t CudaInternal::impl_get_instance_id() const { return m_instance_id; } CudaInternal &CudaInternal::singleton() { static CudaInternal self; return self; } +void CudaInternal::fence(const std::string &name) const { + Impl::cuda_stream_synchronize(m_stream, this, name); +} void CudaInternal::fence() const { - CUDA_SAFE_CALL(cudaStreamSynchronize(m_stream)); + fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { +void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream, + bool manage_stream) { if (was_finalized) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); was_initialized = true; @@ -350,8 +385,9 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { m_cudaDev = cuda_device_id; m_deviceProp = cudaProp; - CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); - Kokkos::Impl::cuda_device_synchronize(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); + Kokkos::Impl::cuda_device_synchronize( + "Kokkos::CudaInternal::initialize: Fence on space initialization"); // Query what compute capability architecture a kernel executes: m_cudaArch = cuda_kernel_arch(); @@ -464,8 +500,8 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { m_scratchConcurrentBitset = reinterpret_cast(r->data()); - CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0, - sizeof(uint32_t) * buffer_bound)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemset(m_scratchConcurrentBitset, 0, + sizeof(uint32_t) * buffer_bound)); } //---------------------------------- @@ -535,15 +571,19 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default // Allocate a staging buffer for constant mem in pinned host memory // and an event to avoid overwriting driver for previous kernel launches if (stream == nullptr) { - CUDA_SAFE_CALL(cudaMallocHost((void **)&constantMemHostStaging, - CudaTraits::ConstantMemoryUsage)); + 
KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaMallocHost(reinterpret_cast<void **>(&constantMemHostStaging),
+                       CudaTraits::ConstantMemoryUsage));
 
-    CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventCreate(&constantMemReusable));
   }
 
-  m_stream                    = stream;
-  m_team_scratch_current_size = 0;
-  m_team_scratch_ptr          = nullptr;
+  m_stream        = stream;
+  m_manage_stream = manage_stream;
+  for (int i = 0; i < m_n_team_scratch; ++i) {
+    m_team_scratch_current_size[i] = 0;
+    m_team_scratch_ptr[i]          = nullptr;
+  }
 }
 
 //----------------------------------------------------------------------------
@@ -569,7 +609,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const {
 
     m_scratchFlags = reinterpret_cast<size_type *>(r->data());
 
-    CUDA_SAFE_CALL(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
         cudaMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain));
   }
 
@@ -645,20 +685,37 @@ Cuda::size_type *CudaInternal::scratch_functor(
   return m_scratchFunctor;
 }
 
-void *CudaInternal::resize_team_scratch_space(std::int64_t bytes,
-                                              bool force_shrink) {
-  if (m_team_scratch_current_size == 0) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_malloc(
-        "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size);
+std::pair CudaInternal::resize_team_scratch_space(
+    std::int64_t bytes, bool force_shrink) {
+  // Multiple ParallelFor/Reduce Teams can call this function at the same time
+  // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race
+  // condition.
+
+  int current_team_scratch = 0;
+  int zero                 = 0;
+  int one                  = 1;
+  while (m_team_scratch_pool[current_team_scratch].compare_exchange_weak(
+      zero, one, std::memory_order_release, std::memory_order_relaxed)) {
+    current_team_scratch = (current_team_scratch + 1) % m_n_team_scratch;
   }
-  if ((bytes > m_team_scratch_current_size) ||
-      ((bytes < m_team_scratch_current_size) && (force_shrink))) {
-    m_team_scratch_current_size = bytes;
-    m_team_scratch_ptr          = Kokkos::kokkos_realloc(
-        m_team_scratch_ptr, m_team_scratch_current_size);
+  if (m_team_scratch_current_size[current_team_scratch] == 0) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_malloc(
+            "Kokkos::CudaSpace::TeamScratchMemory",
+            m_team_scratch_current_size[current_team_scratch]);
   }
-  return m_team_scratch_ptr;
+  if ((bytes > m_team_scratch_current_size[current_team_scratch]) ||
+      ((bytes < m_team_scratch_current_size[current_team_scratch]) &&
+       (force_shrink))) {
+    m_team_scratch_current_size[current_team_scratch] = bytes;
+    m_team_scratch_ptr[current_team_scratch] =
+        Kokkos::kokkos_realloc(
+            m_team_scratch_ptr[current_team_scratch],
+            m_team_scratch_current_size[current_team_scratch]);
+  }
+  return std::make_pair(m_team_scratch_ptr[current_team_scratch],
+                        current_team_scratch);
 }
 
 //----------------------------------------------------------------------------
@@ -685,36 +742,43 @@ void CudaInternal::finalize() {
     if (m_scratchFunctorSize > 0)
       RecordCuda::decrement(RecordCuda::get_record(m_scratchFunctor));
 
-    if (m_team_scratch_current_size > 0)
-      Kokkos::kokkos_free(m_team_scratch_ptr);
+    for (int i = 0; i < m_n_team_scratch; ++i) {
+      if (m_team_scratch_current_size[i] > 0)
+        Kokkos::kokkos_free(m_team_scratch_ptr[i]);
+    }
 
-    m_cudaDev             = -1;
-    m_multiProcCount      = 0;
-    m_maxWarpCount        = 0;
-    m_maxBlock            = 0;
-    m_maxSharedWords      = 0;
-    m_scratchSpaceCount   = 0;
-    m_scratchFlagsCount   = 0;
-    m_scratchUnifiedCount = 0;
-    m_streamCount         = 0;
-    m_scratchSpace        = nullptr;
-
m_scratchFlags = nullptr; - m_scratchUnified = nullptr; - m_scratchConcurrentBitset = nullptr; - m_stream = nullptr; - m_team_scratch_current_size = 0; - m_team_scratch_ptr = nullptr; + if (m_manage_stream && m_stream != nullptr) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(m_stream)); + + m_cudaDev = -1; + m_multiProcCount = 0; + m_maxWarpCount = 0; + m_maxBlock = 0; + m_maxSharedWords = 0; + m_scratchSpaceCount = 0; + m_scratchFlagsCount = 0; + m_scratchUnifiedCount = 0; + m_streamCount = 0; + m_scratchSpace = nullptr; + m_scratchFlags = nullptr; + m_scratchUnified = nullptr; + m_scratchConcurrentBitset = nullptr; + m_stream = nullptr; + for (int i = 0; i < m_n_team_scratch; ++i) { + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + } } // only destroy these if we're finalizing the singleton if (this == &singleton()) { - cudaFreeHost(constantMemHostStaging); - cudaEventDestroy(constantMemReusable); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFreeHost(constantMemHostStaging)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaEventDestroy(constantMemReusable)); auto &deep_copy_space = Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); if (deep_copy_space) deep_copy_space->impl_internal_space_instance()->finalize(); - cudaStreamDestroy(cuda_get_deep_copy_stream()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(cuda_get_deep_copy_stream())); } } @@ -823,7 +887,7 @@ Cuda::Cuda() "Cuda instance constructor"); } -Cuda::Cuda(cudaStream_t stream) +Cuda::Cuda(cudaStream_t stream, bool manage_stream) : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { ptr->finalize(); delete ptr; @@ -831,18 +895,31 @@ Cuda::Cuda(cudaStream_t stream) Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, - stream); + stream, manage_stream); } void Cuda::print_configuration(std::ostream &s, const bool) { Impl::CudaInternal::singleton().print_configuration(s); } -void Cuda::impl_static_fence() { Kokkos::Impl::cuda_device_synchronize(); } +void Cuda::impl_static_fence(const std::string &name) { + Kokkos::Impl::cuda_device_synchronize(name); +} +void Cuda::impl_static_fence() { + impl_static_fence("Kokkos::Cuda::impl_static_fence(): Unnamed Static Fence"); +} -void Cuda::fence() const { m_space_instance->fence(); } +void Cuda::fence() const { + fence("Kokkos::Cuda::fence(): Unnamed Instance Fence"); +} +void Cuda::fence(const std::string &name) const { + m_space_instance->fence(name); +} const char *Cuda::name() { return "Cuda"; } +uint32_t Cuda::impl_instance_id() const noexcept { + return m_space_instance->impl_get_instance_id(); +} cudaStream_t Cuda::cuda_stream() const { return m_space_instance->m_stream; } int Cuda::cuda_device() const { return m_space_instance->m_cudaDev; } @@ -877,7 +954,15 @@ void CudaSpaceInitializer::finalize(bool all_spaces) { } } -void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } +void CudaSpaceInitializer::fence() { + Kokkos::Cuda::impl_static_fence( + "Kokkos::CudaSpaceInitializer::fence: Initializer Fence"); +} +void CudaSpaceInitializer::fence(const std::string &name) { + // Kokkos::Cuda::impl_static_fence("Kokkos::CudaSpaceInitializer::fence: + // "+name); //TODO: or this + Kokkos::Cuda::impl_static_fence(name); +} void CudaSpaceInitializer::print_configuration(std::ostream &msg, const bool detail) { @@ -916,12 +1001,6 @@ void CudaSpaceInitializer::print_configuration(std::ostream &msg, msg << "yes\n"; #else msg << "no\n"; -#endif - 
msg << " KOKKOS_ENABLE_CUSPARSE: "; -#ifdef KOKKOS_ENABLE_CUSPARSE - msg << "yes\n"; -#else - msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index aaec2c2926..7eb169838c 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -3,6 +3,9 @@ #include #include +#include +#include + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- // These functions fulfill the purpose of allowing to work around @@ -114,10 +117,14 @@ class CudaInternal { mutable size_type* m_scratchFunctor; uint32_t* m_scratchConcurrentBitset; cudaStream_t m_stream; + uint32_t m_instance_id; + bool m_manage_stream; // Team Scratch Level 1 Space - mutable int64_t m_team_scratch_current_size; - mutable void* m_team_scratch_ptr; + int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[10]; + mutable void* m_team_scratch_ptr[10]; + mutable std::atomic_int m_team_scratch_pool[10]; bool was_initialized = false; bool was_finalized = false; @@ -135,7 +142,8 @@ class CudaInternal { return nullptr != m_scratchSpace && nullptr != m_scratchFlags; } - void initialize(int cuda_device_id, cudaStream_t stream = nullptr); + void initialize(int cuda_device_id, cudaStream_t stream = nullptr, + bool manage_stream = false); void finalize(); void print_configuration(std::ostream&) const; @@ -145,6 +153,7 @@ class CudaInternal { static void cuda_set_serial_execution(bool); #endif + void fence(const std::string&) const; void fence() const; ~CudaInternal(); @@ -175,20 +184,68 @@ class CudaInternal { m_scratchFunctor(nullptr), m_scratchConcurrentBitset(nullptr), m_stream(nullptr), - m_team_scratch_current_size(0), - m_team_scratch_ptr(nullptr) {} + m_instance_id( + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this))) { + for (int i = 0; i < m_n_team_scratch; ++i) { + m_team_scratch_current_size[i] = 0; + m_team_scratch_ptr[i] = nullptr; + m_team_scratch_pool[i] = 0; + } + } // Resizing of reduction related scratch spaces size_type* scratch_space(const size_type size) const; size_type* scratch_flags(const size_type size) const; size_type* scratch_unified(const size_type size) const; size_type* scratch_functor(const size_type size) const; - + uint32_t impl_get_instance_id() const; // Resizing of team level 1 scratch - void* resize_team_scratch_space(std::int64_t bytes, - bool force_shrink = false); + std::pair resize_team_scratch_space(std::int64_t bytes, + bool force_shrink = false); }; } // Namespace Impl + +namespace Experimental { +// Partitioning an Execution Space: expects space and integer arguments for +// relative weight +// Customization point for backends +// Default behavior is to return the passed in instance + +namespace Impl { +inline void create_Cuda_instances(std::vector& instances) { + for (int s = 0; s < int(instances.size()); s++) { + cudaStream_t stream; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream)); + instances[s] = Cuda(stream, true); + } +} +} // namespace Impl + +template +std::vector partition_space(const Cuda&, Args...) { +#ifdef __cpp_fold_expressions + static_assert( + (... 
&& std::is_arithmetic_v<Args>),
+      "Kokkos Error: partitioning arguments must be integers or floats");
+#endif
+  std::vector<Cuda> instances(sizeof...(Args));
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
+
+template <class T>
+std::vector<Cuda> partition_space(const Cuda&, std::vector<T>& weights) {
+  static_assert(
+      std::is_arithmetic<T>::value,
+      "Kokkos Error: partitioning arguments must be integers or floats");
+
+  std::vector<Cuda> instances(weights.size());
+  Impl::create_Cuda_instances(instances);
+  return instances;
+}
+}  // namespace Experimental
+
 }  // Namespace Kokkos
 #endif
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
index d892a893b3..4b01798f5e 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp
@@ -167,7 +167,7 @@ inline void configure_shmem_preference(KernelFuncPtr const& func,
 #ifndef KOKKOS_ARCH_KEPLER
   // On Kepler the L1 has no benefit since it doesn't cache reads
   auto set_cache_config = [&] {
-    CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFuncSetCacheConfig(
         func,
         (prefer_shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1)));
     return prefer_shmem;
@@ -372,14 +372,15 @@ struct CudaParallelLaunchKernelInvoker<
     params.kernelParams = (void**)args;
     params.extra        = nullptr;

-    CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
         &graph_node, graph, /* dependencies = */ nullptr,
         /* numDependencies = */ 0, &params));
   } else {
     // We still need an empty node for the dependency structure
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&graph_node, graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
   }
   KOKKOS_ENSURES(bool(graph_node))
 }
@@ -475,14 +476,15 @@ struct CudaParallelLaunchKernelInvoker<
     params.kernelParams = (void**)args;
     params.extra        = nullptr;

-    CUDA_SAFE_CALL(cudaGraphAddKernelNode(
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGraphAddKernelNode(
         &graph_node, graph, /* dependencies = */ nullptr,
         /* numDependencies = */ 0, &params));
   } else {
     // We still need an empty node for the dependency structure
-    CUDA_SAFE_CALL(cudaGraphAddEmptyNode(&graph_node, graph,
-                                         /* dependencies = */ nullptr,
-                                         /* numDependencies = */ 0));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaGraphAddEmptyNode(&graph_node, graph,
+                              /* dependencies = */ nullptr,
+                              /* numDependencies = */ 0));
   }
   KOKKOS_ENSURES(bool(graph_node))
 }
@@ -538,7 +540,8 @@ struct CudaParallelLaunchKernelInvoker<
       dim3 const& block, int shmem, CudaInternal const* cuda_instance) {
     // Wait until the previous kernel that uses the constant buffer is done
-    CUDA_SAFE_CALL(cudaEventSynchronize(cuda_instance->constantMemReusable));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventSynchronize(cuda_instance->constantMemReusable));

     // Copy functor (synchronously) to staging buffer in pinned host memory
     unsigned long* staging = cuda_instance->constantMemHostStaging;
@@ -554,8 +557,9 @@ struct CudaParallelLaunchKernelInvoker<
         get_kernel_func())<<<grid, block, shmem, cuda_instance->m_stream>>>();

     // Record an event that says when the constant buffer can be reused
-    CUDA_SAFE_CALL(cudaEventRecord(cuda_instance->constantMemReusable,
-                                   cudaStream_t(cuda_instance->m_stream)));
+    KOKKOS_IMPL_CUDA_SAFE_CALL(
+        cudaEventRecord(cuda_instance->constantMemReusable,
+                        cudaStream_t(cuda_instance->m_stream)));
   }

 #ifdef KOKKOS_CUDA_ENABLE_GRAPHS
@@ -637,8 +641,9 @@ struct
CudaParallelLaunchImpl< base_t::invoke_kernel(driver, grid, block, shmem, cuda_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - CUDA_SAFE_CALL(cudaGetLastError()); - cuda_instance->fence(); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + cuda_instance->fence( + "Kokkos::Impl::launch_kernel: Debug Only Check for Execution Error"); #endif } } @@ -650,7 +655,7 @@ struct CudaParallelLaunchImpl< // the code and the result is visible. auto wrap_get_attributes = []() -> cudaFuncAttributes { cudaFuncAttributes attr_tmp; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaFuncGetAttributes(&attr_tmp, base_t::get_kernel_func())); return attr_tmp; }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp index ff31649544..1f3024f318 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.cpp @@ -81,22 +81,34 @@ namespace Impl { CudaLockArrays g_host_cuda_lock_arrays = {nullptr, nullptr, 0}; void initialize_host_cuda_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::init_lock_arrays(); + + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); +#endif if (g_host_cuda_lock_arrays.atomic != nullptr) return; - CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic, - sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1))); - CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch, - sizeof(int) * (Cuda::concurrency()))); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaMalloc(&g_host_cuda_lock_arrays.atomic, + sizeof(int) * (CUDA_SPACE_ATOMIC_MASK + 1))); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch, + sizeof(int) * (Cuda::concurrency()))); + Impl::cuda_device_synchronize( + "Kokkos::Impl::initialize_host_cuda_lock_arrays: Pre Init Lock Arrays"); g_host_cuda_lock_arrays.n = Cuda::concurrency(); KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency() + 255) / 256, 256>>>(Kokkos::Cuda::concurrency()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::initialize_host_cuda_lock_arrays: Post Init Lock Arrays"); } void finalize_host_cuda_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::finalize_lock_arrays(); +#endif + if (g_host_cuda_lock_arrays.atomic == nullptr) return; cudaFree(g_host_cuda_lock_arrays.atomic); g_host_cuda_lock_arrays.atomic = nullptr; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp index 7640b8084d..04fb7cb345 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Locks.hpp @@ -53,6 +53,10 @@ #include +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +#include +#endif + namespace Kokkos { namespace Impl { @@ -150,13 +154,14 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace } // namespace Impl } // namespace Kokkos + /* Dan Ibanez: it is critical that this code be a macro, so that it will capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays! putting this in an inline function will NOT do the right thing! 
*/ #define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ { \ if (::Kokkos::Impl::lock_array_copied == 0) { \ - CUDA_SAFE_CALL( \ + KOKKOS_IMPL_CUDA_SAFE_CALL( \ cudaMemcpyToSymbol(Kokkos::Impl::g_device_cuda_lock_arrays, \ &Kokkos::Impl::g_host_cuda_lock_arrays, \ sizeof(Kokkos::Impl::CudaLockArrays))); \ @@ -164,6 +169,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } lock_array_copied = 1; \ } +#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE #define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() #else @@ -171,6 +178,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() #endif +#else + +#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +#else +// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. +#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ + DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +#endif + +#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ + #endif /* defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 2834e6f3de..f83b43e608 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -62,7 +62,6 @@ #include #include #include -#include #include #include @@ -240,9 +239,11 @@ class TeamPolicyInternal //---------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 KOKKOS_DEPRECATED inline int vector_length() const { return impl_vector_length(); } +#endif inline int impl_vector_length() const { return m_vector_length; } inline int team_size() const { return m_team_size; } inline int league_size() const { return m_league_size; } @@ -687,6 +688,7 @@ class ParallelFor, int m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; + int m_scratch_pool_id = -1; template __device__ inline @@ -797,15 +799,19 @@ class ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. m_scratch_ptr[0] = nullptr; - m_scratch_ptr[1] = - m_team_size <= 0 - ? 
nullptr - : m_policy.space() - .impl_internal_space_instance() - ->resize_team_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast(Cuda::concurrency() / - (m_team_size * m_vector_size))); + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + auto scratch_ptr_id = + m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast(Cuda::concurrency() / + (m_team_size * m_vector_size)))); + m_scratch_ptr[1] = scratch_ptr_id.first; + m_scratch_pool_id = scratch_ptr_id.second; + } const int shmem_size_total = m_shmem_begin + m_shmem_size; if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < @@ -829,6 +835,14 @@ class ParallelFor, "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); } } + + ~ParallelFor() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_pool[m_scratch_pool_id] = 0; + } + } }; } // namespace Impl @@ -870,9 +884,24 @@ class ParallelReduce, ReducerType, using value_type = typename ValueTraits::value_type; using reference_type = typename ValueTraits::reference_type; using functor_type = FunctorType; - using size_type = Kokkos::Cuda::size_type; - using index_type = typename Policy::index_type; - using reducer_type = ReducerType; + // Conditionally set word_size_type to int16_t or int8_t if value_type is + // smaller than int32_t (Kokkos::Cuda::size_type) + // word_size_type is used to determine the word count, shared memory buffer + // size, and global memory buffer size before the reduction is performed. + // Within the reduction, the word count is recomputed based on word_size_type + // and when calculating indexes into the shared/global memory buffers for + // performing the reduction, word_size_type is used again. + // For scalars > 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. 
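// ----------------------------------------------------------------------------
// Illustrative sketch (not from the patch itself): the comment above motivates
// the word_size_type alias defined just below. Here is the same
// std::conditional selection stated standalone, with std::uint32_t standing in
// for Kokkos::Cuda::size_type (an assumption of this sketch, as is the name
// word_size_type_for). The point: a 2-byte reduction scalar counted in 4-byte
// words would yield zero words per value, so a narrower word type is needed
// before the shared/global buffers can be indexed at all.
#include <cstdint>
#include <type_traits>

using cuda_size_type = std::uint32_t;  // stand-in for Kokkos::Cuda::size_type

template <class ValueType>
using word_size_type_for = typename std::conditional<
    sizeof(ValueType) < sizeof(cuda_size_type),
    typename std::conditional<sizeof(ValueType) == 2, std::int16_t,
                              std::int8_t>::type,
    cuda_size_type>::type;

// 1- and 2-byte scalars get matching word types; larger scalars keep the
// 4-byte default, so word_count = value_size / sizeof(word) stays >= 1.
static_assert(sizeof(word_size_type_for<char>) == 1, "1-byte scalar");
static_assert(sizeof(word_size_type_for<std::int16_t>) == 2, "2-byte scalar");
static_assert(sizeof(word_size_type_for<double>) == 4, "4-byte word default");
// ----------------------------------------------------------------------------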
+  using word_size_type = typename std::conditional<
+      sizeof(value_type) < sizeof(Kokkos::Cuda::size_type),
+      typename std::conditional<sizeof(value_type) == 2, int16_t,
+                                int8_t>::type,
+      Kokkos::Cuda::size_type>::type;
+  using index_type   = typename Policy::index_type;
+  using reducer_type = ReducerType;

   // Algorithmic constraints: blockSize is a power of two AND blockDim.y ==
   // blockDim.z == 1
@@ -883,9 +912,11 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   const pointer_type m_result_ptr;
   const bool m_result_ptr_device_accessible;
   const bool m_result_ptr_host_accessible;
-  size_type* m_scratch_space;
-  size_type* m_scratch_flags;
-  size_type* m_unified_space;
+  word_size_type* m_scratch_space;
+  // m_scratch_flags must be of type Cuda::size_type due to use of atomics
+  // for tracking metadata in Kokkos_Cuda_ReduceScan.hpp
+  Cuda::size_type* m_scratch_flags;
+  word_size_type* m_unified_space;

   // Shall we use the shfl based reduction or not (only use it for static sized
   // types of more than 128bit)
@@ -924,16 +955,16 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
   __device__ inline
      void run(const DummySHMEMReductionType& ) const
     {*/
-    const integral_nonzero_constant<size_type, ValueTraits::StaticValueSize /
-                                                   sizeof(size_type)>
+    const integral_nonzero_constant<
+        word_size_type, ValueTraits::StaticValueSize / sizeof(word_size_type)>
         word_count(ValueTraits::value_size(
                        ReducerConditional::select(m_functor, m_reducer)) /
-                   sizeof(size_type));
+                   sizeof(word_size_type));

     {
       reference_type value =
           ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
-                          kokkos_impl_cuda_shared_memory<size_type>() +
+                          kokkos_impl_cuda_shared_memory<word_size_type>() +
                               threadIdx.y * word_count.value);

       // Number of blocks is bounded so that the reduction can be limited to two
@@ -958,11 +989,12 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
       // This is the final block with the final result at the final threads'
       // location
-      size_type* const shared = kokkos_impl_cuda_shared_memory<size_type>() +
-                                (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
+          (blockDim.y - 1) * word_count.value;
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
               : (m_unified_space ? m_unified_space : m_scratch_space);

       if (threadIdx.y == 0) {
@@ -985,17 +1017,17 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,

     if (cuda_single_inter_block_reduce_scan<false, FunctorType, WorkTag>(
             ReducerConditional::select(m_functor, m_reducer), blockIdx.x,
-            gridDim.x, kokkos_impl_cuda_shared_memory<size_type>(),
+            gridDim.x, kokkos_impl_cuda_shared_memory<word_size_type>(),
             m_scratch_space, m_scratch_flags)) {
       // This is the final block with the final result at the final threads'
       // location
-      size_type* const shared =
-          kokkos_impl_cuda_shared_memory<size_type>() +
+      word_size_type* const shared =
+          kokkos_impl_cuda_shared_memory<word_size_type>() +
           (blockDim.y - 1) * word_count.value;
-      size_type* const global =
+      word_size_type* const global =
           m_result_ptr_device_accessible
-              ? reinterpret_cast<size_type*>(m_result_ptr)
+              ? reinterpret_cast<word_size_type*>(m_result_ptr)
               : (m_unified_space ? m_unified_space : m_scratch_space);

       if (threadIdx.y == 0) {
@@ -1100,15 +1132,21 @@ class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,

     KOKKOS_ASSERT(block_size > 0);

-    m_scratch_space = cuda_internal_scratch_space(
+    // TODO: down casting these uses more space than required?
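// ----------------------------------------------------------------------------
// Illustrative sketch (not from the patch itself), re the TODO just above: the
// byte count requested from the scratch allocators below is derived from
// value_size and the block count and is unchanged by the pointer cast; only
// the indexing granularity depends on word_size_type. A plain-number model
// (all names and values here are illustrative, not Kokkos API):
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const std::size_t value_size = 2;    // bytes per reduction value (int16_t)
  const std::size_t block_size = 128;  // max block count, as in the code below
  // allocation request, in bytes: identical for any word type
  std::printf("scratch bytes: %zu\n", value_size * block_size);
  // per-value word count: collapses to zero if counted in 4-byte words
  std::printf("words per value: int16=%zu int32=%zu\n",
              value_size / sizeof(std::int16_t),
              value_size / sizeof(std::int32_t));
}
// ----------------------------------------------------------------------------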
+ m_scratch_space = (word_size_type*)cuda_internal_scratch_space( m_policy.space(), ValueTraits::value_size(ReducerConditional::select( m_functor, m_reducer)) * block_size /* block_size == max block_count */); - m_scratch_flags = - cuda_internal_scratch_flags(m_policy.space(), sizeof(size_type)); - m_unified_space = cuda_internal_scratch_unified( - m_policy.space(), ValueTraits::value_size(ReducerConditional::select( - m_functor, m_reducer))); + + // Intentionally do not downcast to word_size_type since we use Cuda + // atomics in Kokkos_Cuda_ReduceScan.hpp + m_scratch_flags = cuda_internal_scratch_flags(m_policy.space(), + sizeof(Cuda::size_type)); + m_unified_space = + reinterpret_cast(cuda_internal_scratch_unified( + m_policy.space(), + ValueTraits::value_size( + ReducerConditional::select(m_functor, m_reducer)))); // REQUIRED ( 1 , N , 1 ) dim3 block(1, block_size, 1); @@ -1139,7 +1177,9 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce::execute: Result " + "Not Device Accessible"); if (m_result_ptr) { if (m_unified_space) { @@ -1459,7 +1499,9 @@ class ParallelReduce, ReducerType, false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce::execute: " + "Result Not Device Accessible"); if (m_result_ptr) { if (m_unified_space) { @@ -1580,6 +1622,7 @@ class ParallelReduce, size_type m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; + int m_scratch_pool_id = -1; const size_type m_league_size; int m_team_size; const size_type m_vector_size; @@ -1821,7 +1864,9 @@ class ParallelReduce, true); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce::execute: Result " + "Not Device Accessible"); if (m_result_ptr) { if (m_unified_space) { @@ -1895,16 +1940,19 @@ class ParallelReduce, FunctorTeamShmemSize::value(arg_functor, m_team_size); m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_ptr[1] = - m_team_size <= 0 - ? nullptr - : m_policy.space() - .impl_internal_space_instance() - ->resize_team_scratch_space( - static_cast(m_scratch_size[1]) * - (static_cast( - Cuda::concurrency() / - (m_team_size * m_vector_size)))); + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + auto scratch_ptr_id = + m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast(Cuda::concurrency() / + (m_team_size * m_vector_size)))); + m_scratch_ptr[1] = scratch_ptr_id.first; + m_scratch_pool_id = scratch_ptr_id.second; + } // The global parallel_reduce does not support vector_length other than 1 at // the moment @@ -1973,6 +2021,8 @@ class ParallelReduce, cudaFuncAttributes attr = CudaParallelLaunch::get_cuda_func_attributes(); + + // Valid team size not provided, deduce team size m_team_size = m_team_size >= 0 ? m_team_size @@ -1994,15 +2044,19 @@ class ParallelReduce, FunctorTeamShmemSize::value(arg_functor, m_team_size); m_scratch_size[0] = m_shmem_size; m_scratch_size[1] = m_policy.scratch_size(1, m_team_size); - m_scratch_ptr[1] = - m_team_size <= 0 - ? 
nullptr - : m_policy.space() - .impl_internal_space_instance() - ->resize_team_scratch_space( - static_cast(m_scratch_size[1]) * - static_cast(Cuda::concurrency() / - (m_team_size * m_vector_size))); + if (m_team_size <= 0) { + m_scratch_ptr[1] = nullptr; + } else { + auto scratch_ptr_id = + m_policy.space() + .impl_internal_space_instance() + ->resize_team_scratch_space( + static_cast(m_scratch_size[1]) * + (static_cast(Cuda::concurrency() / + (m_team_size * m_vector_size)))); + m_scratch_ptr[1] = scratch_ptr_id.first; + m_scratch_pool_id = scratch_ptr_id.second; + } // The global parallel_reduce does not support vector_length other than 1 at // the moment @@ -2030,13 +2084,28 @@ class ParallelReduce, Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size")); } - if (int(m_team_size) > - arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + + size_type team_size_max = + Kokkos::Impl::cuda_get_max_block_size( + m_policy.space().impl_internal_space_instance(), attr, m_functor, + m_vector_size, m_policy.team_scratch_size(0), + m_policy.thread_scratch_size(0)) / + m_vector_size; + + if ((int)m_team_size > (int)team_size_max) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " "large team size.")); } } + + ~ParallelReduce() { + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_pool[m_scratch_pool_id] = 0; + } + } }; } // namespace Impl @@ -2167,9 +2236,7 @@ class ParallelScan, Kokkos::Cuda> { for (typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end(); iwork_base += blockDim.y) { -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; -#endif + unsigned MASK = __activemask(); const typename Policy::member_type iwork = iwork_base + threadIdx.y; __syncthreads(); // Don't overwrite previous iteration values until they @@ -2182,11 +2249,7 @@ class ParallelScan, Kokkos::Cuda> { for (unsigned i = threadIdx.y; i < word_count.value; ++i) { shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(MASK); if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } // Protect against large scan values. @@ -2457,9 +2520,7 @@ class ParallelScanWithTotal, for (typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end(); iwork_base += blockDim.y) { -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned MASK = KOKKOS_IMPL_CUDA_ACTIVEMASK; -#endif + unsigned MASK = __activemask(); const typename Policy::member_type iwork = iwork_base + threadIdx.y; @@ -2474,11 +2535,7 @@ class ParallelScanWithTotal, shared_data[i + word_count.value] = shared_data[i] = shared_accum[i]; } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(MASK); if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); } // Protect against large scan values. 
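// ----------------------------------------------------------------------------
// Illustrative sketch (not from the patch itself): this file and the next
// replace the KOKKOS_IMPL_CUDA_SYNCWARP / _BALLOT / _SHFL compatibility macros
// with the CUDA 9+ *_sync intrinsics and an explicit lane mask. A minimal
// device-side example of the resulting pattern; it assumes all 32 lanes of the
// warp are converged, hence the full mask, matching the
// __syncwarp(0xffffffff) calls above.
__device__ int warp_sum_all_lanes(int v) {
  // tree reduction: shift values down the warp and accumulate
  for (int delta = 16; delta > 0; delta >>= 1) {
    v += __shfl_down_sync(0xffffffff, v, delta, 32);
  }
  __syncwarp(0xffffffff);  // re-converge before dependent work
  return v;                // lane 0 now holds the warp-wide sum
}
// When only a subset of lanes may be active, the patch derives the mask first,
// e.g.: unsigned mask = __activemask(); int active = __ballot_sync(mask, 1);
// ----------------------------------------------------------------------------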
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index fc9fc3770b..e5b05bcc64 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -191,48 +191,28 @@ __device__ bool cuda_inter_block_reduction( value_type tmp = Kokkos::shfl_down(value, 1, 32); if (id + 1 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; - int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - int active = KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + unsigned int mask = __activemask(); + int active = __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2, 32); if (id + 2 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4, 32); if (id + 4 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8, 32); if (id + 8 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16, 32); if (id + 16 < int(gridDim.x)) join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); } } // The last block has in its thread=0 the global reduction value through @@ -388,48 +368,28 @@ __device__ inline value_type tmp = Kokkos::shfl_down(value, 1, 32); if (id + 1 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK; - int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - int active = KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + unsigned int mask = __activemask(); + int active = __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2, 32); if (id + 2 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4, 32); if (id + 4 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if (int(blockDim.x * blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8, 32); if (id + 8 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); if 
(int(blockDim.x * blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16, 32); if (id + 16 < int(gridDim.x)) reducer.join(value, tmp); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask, 1); -#else - active += KOKKOS_IMPL_CUDA_BALLOT(1); -#endif + active += __ballot_sync(mask, 1); } } @@ -573,23 +533,17 @@ struct CudaReductionsFunctor { // part of the reduction const int width) // How much of the warp participates { -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK unsigned mask = width == 32 ? 0xffffffff : ((1 << width) - 1) << ((threadIdx.y * blockDim.x + threadIdx.x) / width) * width; -#endif const int lane_id = (threadIdx.y * blockDim.x + threadIdx.x) % 32; for (int delta = skip_vector ? blockDim.x : 1; delta < width; delta *= 2) { if (lane_id + delta < 32) { ValueJoin::join(functor, value, value + delta); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(mask); } *value = *(value - lane_id); } @@ -612,17 +566,18 @@ struct CudaReductionsFunctor { const unsigned int delta = (threadIdx.y * blockDim.x + threadIdx.x) * 32; if (delta < blockDim.x * blockDim.y) *my_shared_team_buffer_element = shared_team_buffer_element[delta]; - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); scalar_intra_warp_reduction(functor, my_shared_team_buffer_element, false, blockDim.x * blockDim.y / 32); if (threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element; } } + template __device__ static inline bool scalar_inter_block_reduction( const FunctorType& functor, const Cuda::size_type /*block_id*/, - const Cuda::size_type block_count, Cuda::size_type* const shared_data, - Cuda::size_type* const global_data, Cuda::size_type* const global_flags) { + const Cuda::size_type block_count, SizeType* const shared_data, + SizeType* const global_data, Cuda::size_type* const global_flags) { Scalar* const global_team_buffer_element = ((Scalar*)global_data); Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x; @@ -713,17 +668,17 @@ __device__ void cuda_intra_block_reduce_scan( const pointer_type tdata_intra = base_data + value_count * threadIdx.y; { // Intra-warp reduction: - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 0) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 1) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 2) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 3) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_REDUCE_STEP(rtid_intra, tdata_intra, 4) - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); } __syncthreads(); // Wait for all warps to reduce @@ -732,57 +687,31 @@ __device__ void cuda_intra_block_reduce_scan( const unsigned rtid_inter = (threadIdx.y ^ BlockSizeMask) << CudaTraits::WarpIndexShift; -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - unsigned inner_mask = - KOKKOS_IMPL_CUDA_BALLOT_MASK(0xffffffff, (rtid_inter < blockDim.y)); -#endif + unsigned inner_mask = __ballot_sync(0xffffffff, (rtid_inter < blockDim.y)); if (rtid_inter < blockDim.y) { const pointer_type tdata_inter = base_data + value_count * (rtid_inter ^ BlockSizeMask); -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK if ((1 << 5) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); 
BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5) } if ((1 << 6) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6) } if ((1 << 7) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7) } if ((1 << 8) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8) } if ((1 << 9) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9) } -#else - if ((1 << 5) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 5) - } - if ((1 << 6) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 6) - } - if ((1 << 7) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 7) - } - if ((1 << 8) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 8) - } - if ((1 << 9) < BlockSizeMask) { - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_REDUCE_STEP(rtid_inter, tdata_inter, 9) - } -#endif if (DoScan) { int n = @@ -795,25 +724,14 @@ __device__ void cuda_intra_block_reduce_scan( if (!(rtid_inter + n < blockDim.y)) n = 0; -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 8) - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 7) - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 6) - KOKKOS_IMPL_CUDA_SYNCWARP_MASK(inner_mask); + __syncwarp(inner_mask); BLOCK_SCAN_STEP(tdata_inter, n, 5) -#else - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 8) - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 7) - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 6) - KOKKOS_IMPL_CUDA_SYNCWARP; - BLOCK_SCAN_STEP(tdata_inter, n, 5) -#endif } } } @@ -832,17 +750,17 @@ __device__ void cuda_intra_block_reduce_scan( : ((rtid_intra & 16) ? 16 : 0)))); if (!(rtid_intra + n < blockDim.y)) n = 0; - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 4) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 3) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 2) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 1) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); BLOCK_SCAN_STEP(tdata_intra, n, 0) __threadfence_block(); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); } #undef BLOCK_SCAN_STEP @@ -858,12 +776,13 @@ __device__ void cuda_intra_block_reduce_scan( * Global reduce result is in the last threads' 'shared_data' location. 
*/ -template +template __device__ bool cuda_single_inter_block_reduce_scan2( const FunctorType& functor, const Cuda::size_type block_id, - const Cuda::size_type block_count, Cuda::size_type* const shared_data, - Cuda::size_type* const global_data, Cuda::size_type* const global_flags) { - using size_type = Cuda::size_type; + const Cuda::size_type block_count, SizeType* const shared_data, + SizeType* const global_data, Cuda::size_type* const global_flags) { + using size_type = SizeType; using ValueTraits = FunctorValueTraits; using ValueJoin = FunctorValueJoin; using ValueInit = FunctorValueInit; @@ -953,11 +872,12 @@ __device__ bool cuda_single_inter_block_reduce_scan2( return is_last_block; } -template +template __device__ bool cuda_single_inter_block_reduce_scan( const FunctorType& functor, const Cuda::size_type block_id, - const Cuda::size_type block_count, Cuda::size_type* const shared_data, - Cuda::size_type* const global_data, Cuda::size_type* const global_flags) { + const Cuda::size_type block_count, SizeType* const shared_data, + SizeType* const global_data, Cuda::size_type* const global_flags) { using ValueTraits = FunctorValueTraits; if (!DoScan && ValueTraits::StaticValueSize > 0) return Kokkos::Impl::CudaReductionsFunctor< diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 2004edbeac..88ac0d1878 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -54,11 +54,27 @@ #include #include -#include // CUDA_SAFE_CALL +#include // KOKKOS_IMPL_CUDA_SAFE_CALL #include //---------------------------------------------------------------------------- +#if defined(__CUDA_ARCH__) +#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ + { \ + __syncwarp(); \ + const unsigned b = __activemask(); \ + if (b != 0xffffffff) { \ + printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG, \ + blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \ + threadIdx.z, b); \ + return; \ + } \ + } +#else +#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) +#endif + namespace Kokkos { namespace Impl { namespace { @@ -138,13 +154,13 @@ class TaskQueueSpecialization> { // Broadcast task pointer: // Sync before the broadcast - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // pretend it's an int* for shuffle purposes ((int*)¤t_task)[0] = - KOKKOS_IMPL_CUDA_SHFL(((int*)¤t_task)[0], 0, 32); + __shfl_sync(0xffffffff, ((int*)¤t_task)[0], 0, 32); ((int*)¤t_task)[1] = - KOKKOS_IMPL_CUDA_SHFL(((int*)¤t_task)[1], 0, 32); + __shfl_sync(0xffffffff, ((int*)¤t_task)[1], 0, 32); if (current_task) { KOKKOS_ASSERT(!current_task->as_runnable_task().get_respawn_flag()); @@ -168,7 +184,7 @@ class TaskQueueSpecialization> { // Synchronize threads of the warp and insure memory // writes are visible to all threads in the warp. - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); if (shared_memory_task_copy->is_team_runnable()) { // Thread Team Task @@ -182,7 +198,7 @@ class TaskQueueSpecialization> { // Synchronize threads of the warp and insure memory // writes are visible to all threads in the warp. - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // if(warp_lane < b % CudaTraits::WarpSize) b += CudaTraits::WarpSize; // b -= b % CudaTraits::WarpSize; @@ -196,7 +212,7 @@ class TaskQueueSpecialization> { // writes are visible to root thread of the warp for // respawn or completion. 
- KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); if (warp_lane == 0) { // If respawn requested copy respawn data back to main memory @@ -249,12 +265,14 @@ class TaskQueueSpecialization> { auto& queue = scheduler.queue(); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Pre Task Execution"); // Query the stack size, in bytes: size_t previous_stack_size = 0; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: @@ -262,18 +280,21 @@ class TaskQueueSpecialization> { const size_t larger_stack_size = 1 << 11; if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - CUDA_SAFE_CALL(cudaGetLastError()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Post Task Execution"); if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size)); } } @@ -295,13 +316,17 @@ class TaskQueueSpecialization> { destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type)); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Pre Get Function Pointer for Tasks"); set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecialization::execute: Post Get Function Pointer for Tasks"); ptr = *ptr_ptr; dtor = *dtor_ptr; @@ -372,23 +397,20 @@ class TaskQueueSpecializationConstrained< // count of 0 also. Otherwise, returns a task from another queue // or `end` if one couldn't be popped task_ptr = team_queue.attempt_to_steal_task(); -#if 0 - if(task != no_more_tasks_sentinel && task != end) { - std::printf("task stolen on rank %d\n", team_exec.league_rank()); - } -#endif } } // Synchronize warp with memory fence before broadcasting task pointer: // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "A" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // Broadcast task pointer: - ((int*)&task_ptr)[0] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[0], 0, 32); - ((int*)&task_ptr)[1] = KOKKOS_IMPL_CUDA_SHFL(((int*)&task_ptr)[1], 0, 32); + ((int*)&task_ptr)[0] = + __shfl_sync(0xffffffff, ((int*)&task_ptr)[0], 0, 32); + ((int*)&task_ptr)[1] = + __shfl_sync(0xffffffff, ((int*)&task_ptr)[1], 0, 32); #if defined(KOKKOS_ENABLE_DEBUG) KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN("TaskQueue CUDA task_ptr"); @@ -418,7 +440,7 @@ class TaskQueueSpecializationConstrained< // writes are visible to all threads in the warp. // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "B" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); if (task_root_type::TaskTeam == task_shmem->m_task_type) { // Thread Team Task @@ -432,7 +454,7 @@ class TaskQueueSpecializationConstrained< // writes are visible to all threads in the warp. 
// KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "C" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // copy task closure from shared to global memory: @@ -445,7 +467,7 @@ class TaskQueueSpecializationConstrained< // respawn or completion. // KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN( "D" ); - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); // If respawn requested copy respawn data back to main memory @@ -475,12 +497,14 @@ class TaskQueueSpecializationConstrained< auto& queue = scheduler.queue(); queue.initialize_team_queues(warps_per_block * grid.x); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Pre Execute Task"); // Query the stack size, in bytes: size_t previous_stack_size = 0; - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceGetLimit(&previous_stack_size, cudaLimitStackSize)); // If not large enough then set the stack size, in bytes: @@ -488,18 +512,21 @@ class TaskQueueSpecializationConstrained< const size_t larger_stack_size = 2048; if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL(cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceSetLimit(cudaLimitStackSize, larger_stack_size)); } cuda_task_queue_execute<<>>( scheduler, shared_per_warp); - CUDA_SAFE_CALL(cudaGetLastError()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: Post Execute Task"); if (previous_stack_size < larger_stack_size) { - CUDA_SAFE_CALL( + KOKKOS_IMPL_CUDA_SAFE_CALL( cudaDeviceSetLimit(cudaLimitStackSize, previous_stack_size)); } } @@ -516,13 +543,17 @@ class TaskQueueSpecializationConstrained< destroy_type* dtor_ptr = (destroy_type*)((char*)storage + sizeof(function_type)); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Pre Get Function Pointer"); set_cuda_task_base_apply_function_pointer <<<1, 1>>>(ptr_ptr, dtor_ptr); - CUDA_SAFE_CALL(cudaGetLastError()); - CUDA_SAFE_CALL(cudaDeviceSynchronize()); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetLastError()); + Impl::cuda_device_synchronize( + "Kokkos::Impl::TaskQueueSpecializationConstrained::get_function_pointer: Post Get Function Pointer"); ptr = *ptr_ptr; dtor = *dtor_ptr; @@ -609,7 +640,7 @@ class TaskExec { __device__ void team_barrier() const { if (1 < m_team_size) { - KOKKOS_IMPL_CUDA_SYNCWARP; + __syncwarp(0xffffffff); } } @@ -1205,5 +1236,7 @@ KOKKOS_INLINE_FUNCTION void single( //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index e780639015..922b980a25 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -340,191 +340,6 @@ class CudaTeamMember { #endif } - //-------------------------------------------------------------------------- - /**\brief Global reduction across all blocks - * - * Return !0 if reducer contains the final value - */ - template - KOKKOS_INLINE_FUNCTION static - typename std::enable_if::value, int>::type - 
global_reduce(ReducerType const& reducer, int* const global_scratch_flags, - void* const global_scratch_space, void* const shmem, - int const shmem_size) { -#ifdef __CUDA_ARCH__ - - using value_type = typename ReducerType::value_type; - using pointer_type = value_type volatile*; - - // Number of shared memory entries for the reduction: - const int nsh = shmem_size / sizeof(value_type); - - // Number of CUDA threads in the block, rank within the block - const int nid = blockDim.x * blockDim.y * blockDim.z; - const int tid = - threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); - - // Reduces within block using all available shared memory - // Contributes if it is the root "vector lane" - - // wn == number of warps in the block - // wx == which lane within the warp - // wy == which warp within the block - - const int wn = - (nid + CudaTraits::WarpIndexMask) >> CudaTraits::WarpIndexShift; - const int wx = tid & CudaTraits::WarpIndexMask; - const int wy = tid >> CudaTraits::WarpIndexShift; - - //------------------------ - { // Intra warp shuffle reduction from contributing CUDA threads - - value_type tmp(reducer.reference()); - - for (int i = CudaTraits::WarpSize; (int)blockDim.x <= (i >>= 1);) { - Impl::in_place_shfl_down(reducer.reference(), tmp, i, - CudaTraits::WarpSize); - - // Root of each vector lane reduces "thread" contribution - if (0 == threadIdx.x && wx < i) { - reducer.join(&tmp, reducer.data()); - } - } - - // Reduce across warps using shared memory. - // Number of warps may not be power of two. - - __syncthreads(); // Wait before shared data write - - // Number of shared memory entries for the reduction - // is at most one per warp - const int nentry = wn < nsh ? wn : nsh; - - if (0 == wx && wy < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - ((value_type*)shmem)[wy] = tmp; - } - - __syncthreads(); // Wait for write to be visible to block - - // When more warps than shared entries - // then warps must take turns joining their contribution - // to the designated shared memory entry. - for (int i = nentry; i < wn; i += nentry) { - const int k = wy - i; - - if (0 == wx && i <= wy && k < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - reducer.join(((value_type*)shmem) + k, &tmp); - } - - __syncthreads(); // Wait for write to be visible to block - } - - // One warp performs the inter-warp reduction: - - if (0 == wy) { - // Start fan-in at power of two covering nentry - - for (int i = (1 << (32 - __clz(nentry - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nentry) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k); - __threadfence_block(); // Wait for write to be visible to warp - } - } - } - } - //------------------------ - { // Write block's value to global_scratch_memory - - int last_block = 0; - - if (0 == wx) { - reducer.copy(((pointer_type)global_scratch_space) + - blockIdx.x * reducer.length(), - reducer.data()); - - __threadfence(); // Wait until global write is visible. - - last_block = (int)gridDim.x == - 1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1); - - // If last block then reset count - if (last_block) *global_scratch_flags = 0; - } - - last_block = __syncthreads_or(last_block); - - if (!last_block) return 0; - } - //------------------------ - // Last block reads global_scratch_memory into shared memory. - - const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh) - : (gridDim.x < nsh ? 
gridDim.x : nsh); - - // nentry = min( nid , nsh , gridDim.x ) - - // whole block reads global memory into shared memory: - - if (tid < nentry) { - const int offset = tid * reducer.length(); - - reducer.copy(((pointer_type)shmem) + offset, - ((pointer_type)global_scratch_space) + offset); - - for (int i = nentry + tid; i < (int)gridDim.x; i += nentry) { - reducer.join( - ((pointer_type)shmem) + offset, - ((pointer_type)global_scratch_space) + i * reducer.length()); - } - } - - __syncthreads(); // Wait for writes to be visible to block - - if (0 == wy) { - // Iterate to reduce shared memory to single warp fan-in size - - const int nreduce = - CudaTraits::WarpSize < nentry ? CudaTraits::WarpSize : nentry; - - // nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x ) - - if (wx < nreduce && nreduce < nentry) { - for (int i = nreduce + wx; i < nentry; i += nreduce) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i); - } - __threadfence_block(); // Wait for writes to be visible to warp - } - - // Start fan-in at power of two covering nentry - - for (int i = (1 << (32 - __clz(nreduce - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nreduce) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + k); - __threadfence_block(); // Wait for writes to be visible to warp - } - } - - if (0 == wx) { - reducer.copy(reducer.data(), (pointer_type)shmem); - return 1; - } - } - return 0; - -#else - (void)reducer; - (void)global_scratch_flags; - (void)global_scratch_space; - (void)shmem; - (void)shmem_size; - return 0; -#endif - } - //---------------------------------------- // Private for the driver @@ -533,7 +348,7 @@ class CudaTeamMember { void* scratch_level_1_ptr, const int scratch_level_1_size, const int arg_league_rank, const int arg_league_size) : m_team_reduce(shared), - m_team_shared(((char*)shared) + shared_begin, shared_size, + m_team_shared(static_cast(shared) + shared_begin, shared_size, scratch_level_1_ptr, scratch_level_1_size), m_team_reduce_size(shared_begin), m_league_rank(arg_league_rank), @@ -854,14 +669,10 @@ KOKKOS_INLINE_FUNCTION void parallel_for( i += blockDim.x) { closure(i); } -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK( - blockDim.x == 32 ? 0xffffffff - : ((1 << blockDim.x) - 1) - << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(blockDim.x == 32 + ? 0xffffffff + : ((1 << blockDim.x) - 1) + << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); #endif } @@ -1100,14 +911,10 @@ KOKKOS_INLINE_FUNCTION void single( (void)lambda; #ifdef __CUDA_ARCH__ if (threadIdx.x == 0) lambda(); -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK( - blockDim.x == 32 ? 0xffffffff - : ((1 << blockDim.x) - 1) - << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(blockDim.x == 32 + ? 0xffffffff + : ((1 << blockDim.x) - 1) + << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); #endif } @@ -1118,14 +925,10 @@ KOKKOS_INLINE_FUNCTION void single( (void)lambda; #ifdef __CUDA_ARCH__ if (threadIdx.x == 0 && threadIdx.y == 0) lambda(); -#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK - KOKKOS_IMPL_CUDA_SYNCWARP_MASK( - blockDim.x == 32 ? 0xffffffff - : ((1 << blockDim.x) - 1) - << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); -#else - KOKKOS_IMPL_CUDA_SYNCWARP; -#endif + __syncwarp(blockDim.x == 32 + ? 
0xffffffff + : ((1 << blockDim.x) - 1) + << (threadIdx.y % (32 / blockDim.x)) * blockDim.x); #endif } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index 7f7b7b6e78..31d3c47e1c 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -48,7 +48,12 @@ #ifdef KOKKOS_ENABLE_CUDA #include -#include + +#if !defined(KOKKOS_COMPILER_CLANG) +#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long) +#else +#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int) +#endif namespace Kokkos { @@ -61,7 +66,7 @@ constexpr unsigned shfl_all_mask = 0xffffffffu; // Shuffle operations require input to be a register (stack) variable // Derived implements do_shfl_op(unsigned mask, T& in, int lane, int width), -// which turns in to one of KOKKOS_IMPL_CUDA_SHFL(_UP_|_DOWN_|_)MASK +// which turns in to one of __shfl_sync(_up|_down) // Since the logic with respect to value sizes, etc., is the same everywhere, // put it all in one place. template @@ -157,7 +162,7 @@ struct in_place_shfl_fn : in_place_shfl_op { (void)val; (void)lane; (void)width; - return KOKKOS_IMPL_CUDA_SHFL_MASK(mask, val, lane, width); + return __shfl_sync(mask, val, lane, width); } }; template @@ -170,7 +175,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op { __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, int lane, int width) const noexcept { - return KOKKOS_IMPL_CUDA_SHFL_UP_MASK(mask, val, lane, width); + return __shfl_up_sync(mask, val, lane, width); } }; template @@ -188,7 +193,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { (void)val; (void)lane; (void)width; - return KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(mask, val, lane, width); + return __shfl_down_sync(mask, val, lane, width); } }; template @@ -228,5 +233,7 @@ __device__ inline T shfl_up(const T& val, int delta, int width, } // end namespace Kokkos +#undef KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF + #endif // defined( KOKKOS_ENABLE_CUDA ) #endif // !defined( KOKKOS_CUDA_VECTORIZATION_HPP ) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp deleted file mode 100644 index 0cdd84ce27..0000000000 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp +++ /dev/null @@ -1,49 +0,0 @@ -#include - -#if defined(__CUDA_ARCH__) -#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask() -#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff) -#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m) -#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(), x) -#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) __ballot_sync(m, x) -#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) __shfl_sync(0xffffffff, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) __shfl_sync(m, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) __shfl_up_sync(0xffffffff, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_UP_MASK(m, x, y, z) __shfl_up_sync(m, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) \ - __shfl_down_sync(0xffffffff, x, y, z) -#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) __shfl_down_sync(m, x, y, z) -#else -#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0 -#define KOKKOS_IMPL_CUDA_SYNCWARP -#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) (void)m -#define KOKKOS_IMPL_CUDA_BALLOT(x) 0 -#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m, x) 0 -#define KOKKOS_IMPL_CUDA_SHFL(x, y, z) 0 -#define KOKKOS_IMPL_CUDA_SHFL_MASK(m, x, y, z) 0 -#define 
KOKKOS_IMPL_CUDA_SHFL_UP(x, y, z) 0 -#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x, y, z) 0 -#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m, x, y, z) 0 -#endif - -#if !defined(KOKKOS_COMPILER_CLANG) -#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(long long) -#else -#define KOKKOS_IMPL_CUDA_MAX_SHFL_SIZEOF sizeof(int) -#endif - -#if defined(__CUDA_ARCH__) -#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ - { \ - __syncwarp(); \ - const unsigned b = __activemask(); \ - if (b != 0xffffffff) { \ - printf(" SYNCWARP AT %s (%d,%d,%d) (%d,%d,%d) failed %x\n", MSG, \ - blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, \ - threadIdx.z, b); \ - return; \ - } \ - } -#else -#define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) -#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 9278d1bdc9..7eb3e1e9f7 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -45,6 +45,7 @@ #ifndef KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP #define KOKKOS_HIP_BLOCKSIZE_DEDUCTION_HPP +#include #include #if defined(__HIPCC__) @@ -56,118 +57,239 @@ namespace Kokkos { namespace Experimental { namespace Impl { -template -void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { - // FIXME_HIP - currently the "constant" path is unimplemented. - // we should look at whether it's functional, and - // perform some simple scaling studies to see when / - // if the constant launcher outperforms the current - // pass by pointer shared launcher - HIP_SAFE_CALL(hipOccupancyMaxActiveBlocksPerMultiprocessor( - numBlocks, - hip_parallel_launch_local_memory, - blockSize, sharedmem)); -} +enum class BlockType { Max, Preferred }; -template -void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { - hipOccupancy( - numBlocks, blockSize, sharedmem); -} - -template -int hip_internal_get_block_size(const F &condition_check, - const HIPInternal *hip_instance, - const hipFuncAttributes &attr, - const FunctorType &f, - const size_t vector_length, - const size_t shmem_block, - const size_t shmem_thread) { - const int min_blocks_per_sm = - LaunchBounds::minBperSM == 0 ? 1 : LaunchBounds::minBperSM; - const int max_threads_per_block = LaunchBounds::maxTperB == 0 - ? HIPTraits::MaxThreadsPerBlock - : LaunchBounds::maxTperB; - - const int regs_per_wavefront = std::max(attr.numRegs, 1); - const int regs_per_sm = hip_instance->m_regsPerSM; - const int shmem_per_sm = hip_instance->m_shmemPerSM; - const int max_shmem_per_block = hip_instance->m_maxShmemPerBlock; - const int max_blocks_per_sm = hip_instance->m_maxBlocksPerSM; - const int max_threads_per_sm = hip_instance->m_maxThreadsPerSM; - - int block_size = max_threads_per_block; - KOKKOS_ASSERT(block_size > 0); - const int blocks_per_warp = - (block_size + HIPTraits::WarpSize - 1) / HIPTraits::WarpSize; - - int functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - int total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - int max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); - int max_blocks_shmem = - (total_shmem < max_shmem_per_block) - ? (total_shmem > 0 ? 
shmem_per_sm / total_shmem : max_blocks_regs) - : 0; - int blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem); - int threads_per_sm = blocks_per_sm * block_size; - if (threads_per_sm > max_threads_per_sm) { - blocks_per_sm = max_threads_per_sm / block_size; - threads_per_sm = blocks_per_sm * block_size; - } - int opt_block_size = - (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm; - int opt_threads_per_sm = threads_per_sm; - block_size -= HIPTraits::WarpSize; - while (condition_check(blocks_per_sm) && - (block_size >= HIPTraits::WarpSize)) { - functor_shmem = ::Kokkos::Impl::FunctorTeamShmemSize::value( - f, block_size / vector_length); - total_shmem = shmem_block + shmem_thread * (block_size / vector_length) + - functor_shmem + attr.sharedSizeBytes; - max_blocks_regs = regs_per_sm / (regs_per_wavefront * blocks_per_warp); - max_blocks_shmem = - (total_shmem < max_shmem_per_block) - ? (total_shmem > 0 ? shmem_per_sm / total_shmem : max_blocks_regs) - : 0; - blocks_per_sm = std::min(max_blocks_regs, max_blocks_shmem); - threads_per_sm = blocks_per_sm * block_size; - if (threads_per_sm > max_threads_per_sm) { - blocks_per_sm = max_threads_per_sm / block_size; - threads_per_sm = blocks_per_sm * block_size; +template , + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +unsigned get_preferred_blocksize_impl() { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // use the user specified value + return LaunchBounds::maxTperB; + } else { + if (HIPParallelLaunch::get_scratch_size() > 0) { + return HIPTraits::ConservativeThreadsPerBlock; } - if ((blocks_per_sm >= min_blocks_per_sm) && - (blocks_per_sm <= max_blocks_per_sm)) { - if (threads_per_sm >= opt_threads_per_sm) { - opt_block_size = block_size; - opt_threads_per_sm = threads_per_sm; + return HIPTraits::MaxThreadsPerBlock; + } +} + +// FIXME_HIP - entire function could be constexpr for c++17 +template , + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +unsigned get_max_blocksize_impl() { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // use the user specified value + return LaunchBounds::maxTperB; + } else { + // we can always fit 1024 threads blocks if we only care about registers + // ... and don't mind spilling + return HIPTraits::MaxThreadsPerBlock; + } +} + +// convenience method to select and return the proper function attributes +// for a kernel, given the launch bounds et al. 
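Condensed, the preferred/max blocksize policy above is a three-way decision. The sketch below restates it with hypothetical names (`preferred_blocksize`, `kMaxThreadsPerBlock`, `kConservativeThreads`); the real code pulls these limits from `HIPTraits` and queries `HIPParallelLaunch` for the launch bounds and scratch usage. The function-attributes helper introduced by the comment above follows right after this note.

```cpp
// Hypothetical condensed restatement of the preferred-blocksize policy;
// illustrative only, not the Kokkos implementation.
constexpr unsigned kMaxThreadsPerBlock  = 1024;  // HIPTraits::MaxThreadsPerBlock
constexpr unsigned kConservativeThreads = 256;   // fallback when spilling

unsigned preferred_blocksize(bool default_launchbounds,
                             unsigned user_max_threads,
                             unsigned scratch_bytes) {
  if (!default_launchbounds) return user_max_threads;  // honor user's bounds
  return scratch_bytes > 0 ? kConservativeThreads      // spills: stay small
                           : kMaxThreadsPerBlock;      // no spills: go wide
}
```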
+template , + BlockType BlockSize = BlockType::Max, + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +hipFuncAttributes get_hip_func_attributes_impl() { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // for user defined, we *always* honor the request + return HIPParallelLaunch::get_hip_func_attributes(); + } else { + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + return HIPParallelLaunch< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism>::get_hip_func_attributes(); + } else { + const int blocksize = + get_preferred_blocksize_impl(); + if (blocksize == HIPTraits::MaxThreadsPerBlock) { + return HIPParallelLaunch< + DriverType, Kokkos::LaunchBounds, + LaunchMechanism>::get_hip_func_attributes(); + } else { + return HIPParallelLaunch< + DriverType, + Kokkos::LaunchBounds, + LaunchMechanism>::get_hip_func_attributes(); } } - block_size -= HIPTraits::WarpSize; } - return opt_block_size; } -template -int hip_get_max_block_size(const HIPInternal *hip_instance, - const hipFuncAttributes &attr, const FunctorType &f, - const size_t vector_length, const size_t shmem_block, - const size_t shmem_thread) { - return hip_internal_get_block_size( - [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, - shmem_block, shmem_thread); +// Given an initial block-size limitation based on register usage +// determine the block size to select based on LDS limitation +template +unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, + const ShmemFunctor &f, + const unsigned tperb_reg) { + // translate LB from CUDA to HIP + const unsigned min_waves_per_eu = + LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1; + const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; + const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; + unsigned block_size = tperb_reg; + do { + unsigned total_shmem = f(block_size); + // find how many threads we can fit with this blocksize based on LDS usage + unsigned tperb_shmem = total_shmem > shmem_per_sm ? 0 : block_size; + + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + // we want the maximum blocksize possible + // just wait until we get a case where we can fit the LDS per SM + if (tperb_shmem) return block_size; + } else { + if (block_size == tperb_reg && tperb_shmem >= tperb_reg) { + // fast path for exit on first iteration if registers are more limiting + // than LDS usage, just use the register limited size + return tperb_reg; + } + // otherwise we need to apply a heuristic to choose the blocksize + // the current launchbound selection scheme is: + // 1. If no spills, choose 1024 [MaxThreadsPerBlock] + // 2. Otherwise, choose 256 [ConservativeThreadsPerBlock] + // + // For blocksizes between 256 and 1024, we'll be forced to use the 1024 LB + // and we'll already have pretty decent occupancy, thus dropping to 256 + // *probably* isn't a concern + const unsigned blocks_per_cu_shmem = shmem_per_sm / total_shmem; + const unsigned tperb = tperb_shmem < tperb_reg ? 
tperb_shmem : tperb_reg; + + // for anything with > 4 WF's & can fit multiple blocks + // we're probably not occupancy limited so just return that + if (blocks_per_cu_shmem > 1 && + tperb > HIPTraits::ConservativeThreadsPerBlock) { + return block_size; + } + + // otherwise, it's probably better to drop to the first valid size that + // fits in the ConservativeThreadsPerBlock + if (tperb >= min_threads_per_sm) return block_size; + } + block_size >>= 1; + } while (block_size >= HIPTraits::WarpSize); + // TODO: return a negative, add an error to kernel launch + return 0; } -template -int hip_get_opt_block_size(HIPInternal const *hip_instance, - hipFuncAttributes const &attr, FunctorType const &f, - size_t const vector_length, size_t const shmem_block, - size_t const shmem_thread) { - return hip_internal_get_block_size( - [](int) { return true; }, hip_instance, attr, f, vector_length, - shmem_block, shmem_thread); +// Standardized blocksize deduction for parallel constructs with no LDS usage +// Returns the preferred blocksize as dictated by register usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_preferred_blocksize() { + return get_preferred_blocksize_impl(); +} + +// Standardized blocksize deduction for parallel constructs with no LDS usage +// Returns the max blocksize as dictated by register usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_max_blocksize() { + return get_max_blocksize_impl(); +} + +// Standardized blocksize deduction for non-teams parallel constructs with LDS +// usage Returns the 'preferred' blocksize, as determined by the heuristics in +// hip_internal_get_block_size +// +// The ShmemFunctor takes a single argument of the current blocksize under +// consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_preferred_blocksize(HIPInternal const *hip_instance, + ShmemFunctor const &f) { + // get preferred blocksize limited by register usage + const unsigned tperb_reg = + hip_get_preferred_blocksize(); + return hip_internal_get_block_size(hip_instance, f, tperb_reg); +} + +// Standardized blocksize deduction for teams-based parallel constructs with LDS +// usage Returns the 'preferred' blocksize, as determined by the heuristics in +// hip_internal_get_block_size +// +// The ShmemTeamsFunctor takes two arguments: the hipFunctionAttributes and +// the current blocksize under consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. 
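The halving search above is easier to read in isolation. Here is a minimal sketch, under assumed names (`max_blocksize_for_lds`, with `lds_usage` standing in for the `ShmemFunctor`), of its `BlockType::Max` branch only: start from the register-limited block size and shrink until the LDS footprint fits in a CU.

```cpp
#include <functional>

constexpr unsigned kWarpSize = 64;  // HIPTraits::WarpSize (one wavefront)

// Sketch of the BlockType::Max branch of the halving search.
unsigned max_blocksize_for_lds(unsigned tperb_reg, unsigned shmem_per_cu,
                               const std::function<unsigned(unsigned)>& lds_usage) {
  for (unsigned bs = tperb_reg; bs >= kWarpSize; bs >>= 1) {
    if (lds_usage(bs) <= shmem_per_cu) return bs;  // first size that fits
  }
  return 0;  // callers treat 0 as "no valid configuration" and throw
}
```

The `BlockType::Preferred` branch layers the occupancy heuristic described in the comments on top of this same loop; the team-level wrapper documented just above continues below.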
+template +unsigned hip_get_preferred_team_blocksize(HIPInternal const *hip_instance, + ShmemTeamsFunctor const &f) { + hipFuncAttributes attr = + get_hip_func_attributes_impl(); + // get preferred blocksize limited by register usage + using namespace std::placeholders; + const unsigned tperb_reg = + hip_get_preferred_blocksize(); + return hip_internal_get_block_size( + hip_instance, std::bind(f, attr, _1), tperb_reg); +} + +// Standardized blocksize deduction for non-teams parallel constructs with LDS +// usage Returns the maximum possible blocksize, as determined by the heuristics +// in hip_internal_get_block_size +// +// The ShmemFunctor takes a single argument of the current blocksize under +// consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_max_blocksize(HIPInternal const *hip_instance, + ShmemFunctor const &f) { + // get max blocksize limited by register usage + const unsigned tperb_reg = hip_get_max_blocksize(); + return hip_internal_get_block_size( + hip_instance, f, tperb_reg); +} + +// Standardized blocksize deduction for teams-based parallel constructs with LDS +// usage Returns the maximum possible blocksize, as determined by the heuristics +// in hip_internal_get_block_size +// +// The ShmemTeamsFunctor takes two arguments: the hipFunctionAttributes and +// the current blocksize under consideration, and returns the LDS usage +// +// Note: a returned block_size of zero indicates that the algorithm could not +// find a valid block size. The caller is responsible for error handling. +template +unsigned hip_get_max_team_blocksize(HIPInternal const *hip_instance, + ShmemTeamsFunctor const &f) { + hipFuncAttributes attr = + get_hip_func_attributes_impl(); + // get max blocksize + using namespace std::placeholders; + const unsigned tperb_reg = hip_get_max_blocksize(); + return hip_internal_get_block_size( + hip_instance, std::bind(f, attr, _1), tperb_reg); } } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp index b3480bcad0..a75e7a4a6c 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Error.hpp @@ -66,12 +66,30 @@ inline void hip_internal_safe_call(hipError_t e, const char* name, } } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 + +KOKKOS_DEPRECATED +inline void hip_internal_safe_call_deprecated(hipError_t e, const char* name, + const char* file = nullptr, + const int line = 0) { + hip_internal_safe_call(e, name, file, line); +} + +#endif + } // namespace Impl } // namespace Kokkos -#define HIP_SAFE_CALL(call) \ +#define KOKKOS_IMPL_HIP_SAFE_CALL(call) \ Kokkos::Impl::hip_internal_safe_call(call, #call, __FILE__, __LINE__) +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 +#define HIP_SAFE_CALL(call) \ + Kokkos::Impl::hip_internal_safe_call_deprecated(call, #call, __FILE__, \ + __LINE__) + +#endif + namespace Kokkos { namespace Experimental { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 18ef10e22c..336ac8c698 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -77,7 +77,7 @@ class HIPInternalDevices { }; HIPInternalDevices::HIPInternalDevices() { - HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&m_hipDevCount)); if 
(m_hipDevCount > MAXIMUM_DEVICE_COUNT) { Kokkos::abort( @@ -85,7 +85,7 @@ HIPInternalDevices::HIPInternalDevices() { "have. Please report this to github.com/kokkos/kokkos."); } for (int i = 0; i < m_hipDevCount; ++i) { - HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(m_hipProp + i, i)); } } @@ -95,6 +95,9 @@ const HIPInternalDevices &HIPInternalDevices::singleton() { } } // namespace +unsigned long *Impl::HIPInternal::constantMemHostStaging = nullptr; +hipEvent_t Impl::HIPInternal::constantMemReusable = nullptr; + namespace Impl { //---------------------------------------------------------------------------- @@ -154,6 +157,9 @@ int HIPInternal::verify_is_initialized(const char *const label) const { return 0 <= m_hipDev; } +uint32_t HIPInternal::impl_get_instance_id() const noexcept { + return m_instance_id; +} HIPInternal &HIPInternal::singleton() { static HIPInternal *self = nullptr; if (!self) { @@ -163,12 +169,23 @@ HIPInternal &HIPInternal::singleton() { } void HIPInternal::fence() const { - HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); - // can reset our cycle id now as well - m_cycleId = 0; + fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence"); +} +void HIPInternal::fence(const std::string &name) const { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + name, + Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{ + impl_get_instance_id()}, + [&]() { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + // can reset our cycle id now as well + m_cycleId = 0; + }); } -void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { +void HIPInternal::initialize(int hip_device_id, hipStream_t stream, + bool manage_stream) { if (was_finalized) Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); @@ -197,9 +214,10 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_hipDev = hip_device_id; m_deviceProp = hipProp; - HIP_SAFE_CALL(hipSetDevice(m_hipDev)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(m_hipDev)); m_stream = stream; + m_manage_stream = manage_stream; m_team_scratch_current_size = 0; m_team_scratch_ptr = nullptr; @@ -222,7 +240,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { // theoretically, we can get 40 WF's / CU, but only can sustain 32 // see // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - m_maxBlocksPerSM = 32; + m_maxWavesPerCU = 32; // FIXME_HIP - Nick to implement this upstream // Register count comes from Sec. 2.2. "Data Sharing" of the // Vega 7nm ISA document (see the diagram) @@ -232,7 +250,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_regsPerSM = 65536; m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; m_maxShmemPerBlock = hipProp.sharedMemPerBlock; - m_maxThreadsPerSM = m_maxBlocksPerSM * HIPTraits::WarpSize; + m_maxThreadsPerSM = m_maxWavesPerCU * HIPTraits::WarpSize; //---------------------------------- // Multiblock reduction uses scratch flags for counters // and scratch space for partial reduction values. 
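Stepping back from the hunk above: the fence changes are mechanical, but the pattern is worth seeing end to end. Every synchronization point now passes a human-readable label through a profiling hook so tools can attribute wait time. A toy stand-in (the `FenceScope` and `profile_fence` names are illustrative, not the Kokkos::Tools API):

```cpp
#include <functional>
#include <iostream>
#include <string>

struct FenceScope { unsigned instance_id; };

// Toy version of the hook: bracket the raw synchronization with
// named begin/end events a profiler could record.
void profile_fence(const std::string& name, FenceScope scope,
                   const std::function<void()>& do_fence) {
  std::cout << "begin fence: " << name << " (instance " << scope.instance_id << ")\n";
  do_fence();  // the real lambda calls hipStreamSynchronize(m_stream)
  std::cout << "end fence:   " << name << "\n";
}

int main() {
  profile_fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence",
                FenceScope{0}, [] { /* stream sync would happen here */ });
}
```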
@@ -265,8 +283,8 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { m_scratchConcurrentBitset = reinterpret_cast(r->data()); - HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0, - sizeof(uint32_t) * buffer_bound)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemset(m_scratchConcurrentBitset, 0, + sizeof(uint32_t) * buffer_bound)); } //---------------------------------- @@ -287,6 +305,15 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { // Init the array for used for arbitrarily sized atomics if (m_stream == nullptr) ::Kokkos::Impl::initialize_host_hip_lock_arrays(); + + // Allocate a staging buffer for constant mem in pinned host memory + // and an event to avoid overwriting driver for previous kernel launches + if (m_stream == nullptr) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostMalloc((void **)&constantMemHostStaging, + HIPTraits::ConstantMemoryUsage)); + + KOKKOS_IMPL_HIP_SAFE_CALL(hipEventCreate(&constantMemReusable)); + } } //---------------------------------------------------------------------------- @@ -339,7 +366,7 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( m_scratchFlags = reinterpret_cast(r->data()); - HIP_SAFE_CALL( + KOKKOS_IMPL_HIP_SAFE_CALL( hipMemset(m_scratchFlags, 0, m_scratchFlagsCount * sizeScratchGrain)); } @@ -365,7 +392,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes, //---------------------------------------------------------------------------- void HIPInternal::finalize() { - this->fence(); + this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); was_finalized = true; if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { using RecordHIP = @@ -378,6 +405,9 @@ void HIPInternal::finalize() { if (m_team_scratch_current_size > 0) Kokkos::kokkos_free(m_team_scratch_ptr); + if (m_manage_stream && m_stream != nullptr) + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(m_stream)); + m_hipDev = -1; m_hipArch = -1; m_multiProcCount = 0; @@ -395,28 +425,36 @@ void HIPInternal::finalize() { m_team_scratch_ptr = nullptr; } if (nullptr != d_driverWorkArray) { - HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); d_driverWorkArray = nullptr; } + + // only destroy these if we're finalizing the singleton + if (this == &singleton()) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); + } } char *HIPInternal::get_next_driver(size_t driverTypeSize) const { std::lock_guard const lock(m_mutexWorkArray); if (d_driverWorkArray == nullptr) { - HIP_SAFE_CALL( + KOKKOS_IMPL_HIP_SAFE_CALL( hipHostMalloc(&d_driverWorkArray, m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), hipHostMallocNonCoherent)); } if (driverTypeSize > m_maxDriverTypeSize) { // fence handles the cycle id reset for us - fence(); - HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + fence( + "Kokkos::HIPInternal::get_next_driver: fence before reallocating " + "resources"); + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); m_maxDriverTypeSize = driverTypeSize; if (m_maxDriverTypeSize % 128 != 0) m_maxDriverTypeSize = m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128; - HIP_SAFE_CALL( + KOKKOS_IMPL_HIP_SAFE_CALL( hipHostMalloc(&d_driverWorkArray, m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), hipHostMallocNonCoherent)); @@ -424,7 +462,9 @@ char *HIPInternal::get_next_driver(size_t driverTypeSize) const { m_cycleId = (m_cycleId + 1) % m_maxDriverCycles; if (m_cycleId == 0) { // 
ensure any outstanding kernels are completed before we wrap around - fence(); + fence( + "Kokkos::HIPInternal::get_next_driver: fence before reusing first " + "driver"); } } return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId]; @@ -462,7 +502,14 @@ Kokkos::Experimental::HIP::size_type *hip_internal_scratch_flags( namespace Kokkos { namespace Impl { -void hip_device_synchronize() { HIP_SAFE_CALL(hipDeviceSynchronize()); } +void hip_device_synchronize(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); }); +} void hip_internal_error_throw(hipError_t e, const char *name, const char *file, const int line) { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index f4f88628e3..967c6fdd4b 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -48,6 +48,7 @@ #define KOKKOS_HIP_INSTANCE_HPP #include +#include #include @@ -59,10 +60,12 @@ struct HIPTraits { static int constexpr WarpSize = 64; static int constexpr WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static int constexpr WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ + static int constexpr ConservativeThreadsPerBlock = + 256; // conservative fallback blocksize in case of spills static int constexpr MaxThreadsPerBlock = - 1024; // FIXME_HIP -- assumed constant for now - + 1024; // the maximum we can fit in a block static int constexpr ConstantMemoryUsage = 0x008000; /* 32k bytes */ + static int constexpr KernelArgumentLimit = 0x001000; /* 4k bytes */ static int constexpr ConstantMemoryUseThreshold = 0x000200; /* 512 bytes */ }; @@ -90,7 +93,7 @@ class HIPInternal { unsigned m_multiProcCount = 0; unsigned m_maxWarpCount = 0; unsigned m_maxBlock = 0; - unsigned m_maxBlocksPerSM = 0; + unsigned m_maxWavesPerCU = 0; unsigned m_maxSharedWords = 0; int m_regsPerSM; int m_shmemPerSM = 0; @@ -108,6 +111,8 @@ class HIPInternal { mutable int m_cycleId = 0; // mutex to access d_driverWorkArray mutable std::mutex m_mutexWorkArray; + // mutex to access shared memory + mutable std::mutex m_mutexSharedMemory; // Scratch Spaces for Reductions size_type m_scratchSpaceCount = 0; @@ -119,7 +124,10 @@ class HIPInternal { hipDeviceProp_t m_deviceProp; - hipStream_t m_stream = nullptr; + hipStream_t m_stream = nullptr; + uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< + Kokkos::Experimental::HIP>(reinterpret_cast(this)); + bool m_manage_stream = false; // Team Scratch Level 1 Space mutable int64_t m_team_scratch_current_size = 0; @@ -128,18 +136,25 @@ class HIPInternal { bool was_finalized = false; + // FIXME_HIP: these want to be per-device, not per-stream... 
use of 'static' + // here will break once there are multiple devices though + static unsigned long *constantMemHostStaging; + static hipEvent_t constantMemReusable; + static HIPInternal &singleton(); int verify_is_initialized(const char *const label) const; int is_initialized() const { return m_hipDev >= 0; } - void initialize(int hip_device_id, hipStream_t stream = nullptr); + void initialize(int hip_device_id, hipStream_t stream = nullptr, + bool manage_stream = false); void finalize(); void print_configuration(std::ostream &) const; void fence() const; + void fence(const std::string &) const; // returns the next driver type pointer in our work array char *get_next_driver(size_t driverTypeSize) const; @@ -151,13 +166,52 @@ class HIPInternal { // Resizing of reduction related scratch spaces size_type *scratch_space(const size_type size); size_type *scratch_flags(const size_type size); - + uint32_t impl_get_instance_id() const noexcept; // Resizing of team level 1 scratch void *resize_team_scratch_space(std::int64_t bytes, bool force_shrink = false); }; } // namespace Impl + +// Partitioning an Execution Space: expects space and integer arguments for +// relative weight +// Customization point for backends +// Default behavior is to return the passed in instance + +namespace Impl { +inline void create_HIP_instances(std::vector &instances) { + for (int s = 0; s < int(instances.size()); s++) { + hipStream_t stream; + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); + instances[s] = HIP(stream, true); + } +} +} // namespace Impl + +template +std::vector partition_space(const HIP &, Args...) { +#ifdef __cpp_fold_expressions + static_assert( + (... && std::is_arithmetic_v), + "Kokkos Error: partitioning arguments must be integers or floats"); +#endif + + std::vector instances(sizeof...(Args)); + Impl::create_HIP_instances(instances); + return instances; +} + +template +std::vector partition_space(const HIP &, std::vector &weights) { + static_assert( + std::is_arithmetic::value, + "Kokkos Error: partitioning arguments must be integers or floats"); + + std::vector instances(weights.size()); + Impl::create_HIP_instances(instances); + return instances; +} } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index f774423b37..f209edf7c0 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -52,6 +52,7 @@ #include #include #include +#include // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -64,7 +65,7 @@ namespace Kokkos { namespace Experimental { template inline __device__ T *kokkos_impl_hip_shared_memory() { - HIP_DYNAMIC_SHARED(HIPSpace::size_type, sh); + extern __shared__ Kokkos::Experimental::HIPSpace::size_type sh[]; return (T *)sh; } } // namespace Experimental @@ -74,10 +75,12 @@ namespace Kokkos { namespace Experimental { namespace Impl { +// The hip_parallel_launch_*_memory code is identical to the cuda code template __global__ static void hip_parallel_launch_constant_memory() { const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); + driver(); } @@ -87,12 +90,13 @@ __global__ __launch_bounds__( const DriverType &driver = *(reinterpret_cast( kokkos_impl_hip_constant_memory_buffer)); - driver->operator()(); + driver(); } template __global__ static void hip_parallel_launch_local_memory( const DriverType *driver) { + // FIXME_HIP driver() pass by 
copy driver->operator()(); } @@ -101,6 +105,21 @@ __global__ __launch_bounds__( maxTperB, minBperSM) static void hip_parallel_launch_local_memory(const DriverType *driver) { + // FIXME_HIP driver() pass by copy + driver->operator()(); +} + +template +__global__ static void hip_parallel_launch_global_memory( + const DriverType *driver) { + driver->operator()(); +} + +template +__global__ __launch_bounds__( + maxTperB, + minBperSM) static void hip_parallel_launch_global_memory(const DriverType + *driver) { driver->operator()(); } @@ -127,33 +146,238 @@ struct HIPDispatchProperties { HIPLaunchMechanism launch_mechanism = l; }; +// Use local memory up to ConstantMemoryUseThreshold +// Use global memory above ConstantMemoryUsage +// In between use ConstantMemory +// The following code is identical to the cuda code +template +struct DeduceHIPLaunchMechanism { + static constexpr Kokkos::Experimental::WorkItemProperty::HintLightWeight_t + light_weight = Kokkos::Experimental::WorkItemProperty::HintLightWeight; + static constexpr Kokkos::Experimental::WorkItemProperty::HintHeavyWeight_t + heavy_weight = Kokkos::Experimental::WorkItemProperty::HintHeavyWeight; + static constexpr typename DriverType::Policy::work_item_property property = + typename DriverType::Policy::work_item_property(); + + static constexpr HIPLaunchMechanism valid_launch_mechanism = + // BuildValidMask + (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::Default) | + (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::Default) | + HIPLaunchMechanism::GlobalMemory; + + static constexpr HIPLaunchMechanism requested_launch_mechanism = + (((property & light_weight) == light_weight) + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::ConstantMemory) | + HIPLaunchMechanism::GlobalMemory; + + static constexpr HIPLaunchMechanism default_launch_mechanism = + // BuildValidMask + (sizeof(DriverType) < HIPTraits::ConstantMemoryUseThreshold) + ? HIPLaunchMechanism::LocalMemory + : ((sizeof(DriverType) < HIPTraits::ConstantMemoryUsage) + ? 
HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory); + + // None LightWeight HeavyWeight + // F +struct HIPParallelLaunchKernelFuncData { + static unsigned int get_scratch_size( + hipFuncAttributes const &hip_func_attributes) { + return hip_func_attributes.localSizeBytes; + } + + static hipFuncAttributes get_hip_func_attributes(void const *kernel_func) { + static hipFuncAttributes attr = [=]() { + hipFuncAttributes attr; + KOKKOS_IMPL_HIP_SAFE_CALL(hipFuncGetAttributes(&attr, kernel_func)); + return attr; + }(); + return attr; + } +}; + +//---------------------------------------------------------------// +// HIPParallelLaunchKernelFunc structure and its specializations // +//---------------------------------------------------------------// template struct HIPParallelLaunchKernelFunc; +// HIPLaunchMechanism::LocalMemory specializations template struct HIPParallelLaunchKernelFunc< DriverType, Kokkos::LaunchBounds, HIPLaunchMechanism::LocalMemory> { + using funcdata_t = HIPParallelLaunchKernelFuncData< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory>; static auto get_kernel_func() { return hip_parallel_launch_local_memory; } + + static constexpr auto default_launchbounds() { return false; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } }; template struct HIPParallelLaunchKernelFunc, HIPLaunchMechanism::LocalMemory> { + using funcdata_t = + HIPParallelLaunchKernelFuncData, + HIPLaunchMechanism::LocalMemory>; static auto get_kernel_func() { - return hip_parallel_launch_local_memory; + return HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory>::get_kernel_func(); + } + + static constexpr auto default_launchbounds() { return true; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); } }; +// HIPLaunchMechanism::GlobalMemory specializations +template +struct HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::GlobalMemory> { + using funcdata_t = HIPParallelLaunchKernelFuncData< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::GlobalMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_global_memory; + } + + static constexpr auto default_launchbounds() { return false; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +template +struct HIPParallelLaunchKernelFunc, + HIPLaunchMechanism::GlobalMemory> { + using funcdata_t = + HIPParallelLaunchKernelFuncData, + HIPLaunchMechanism::GlobalMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_global_memory; + } + + static constexpr auto default_launchbounds() { return true; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +// HIPLaunchMechanism::ConstantMemory 
specializations +template +struct HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::ConstantMemory> { + using funcdata_t = HIPParallelLaunchKernelFuncData< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::ConstantMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_constant_memory; + } + + static constexpr auto default_launchbounds() { return false; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +template +struct HIPParallelLaunchKernelFunc, + HIPLaunchMechanism::ConstantMemory> { + using funcdata_t = + HIPParallelLaunchKernelFuncData, + HIPLaunchMechanism::ConstantMemory>; + static auto get_kernel_func() { + return hip_parallel_launch_constant_memory; + } + static constexpr auto default_launchbounds() { return true; } + + static auto get_scratch_size() { + return funcdata_t::get_scratch_size(get_hip_func_attributes()); + } + + static hipFuncAttributes get_hip_func_attributes() { + return funcdata_t::get_hip_func_attributes( + reinterpret_cast(get_kernel_func())); + } +}; + +//------------------------------------------------------------------// +// HIPParallelLaunchKernelInvoker structure and its specializations // +//------------------------------------------------------------------// template struct HIPParallelLaunchKernelInvoker; +// HIPLaunchMechanism::LocalMemory specialization template struct HIPParallelLaunchKernelInvoker @@ -170,21 +394,83 @@ struct HIPParallelLaunchKernelInvoker +struct HIPParallelLaunchKernelInvoker + : HIPParallelLaunchKernelFunc { + using base_t = HIPParallelLaunchKernelFunc; + + // FIXME_HIP the code is different than cuda because driver cannot be passed + // by copy + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + (base_t::get_kernel_func())<<m_stream>>>( + driver); + } +}; + +// HIPLaunchMechanism::ConstantMemory specializations +template +struct HIPParallelLaunchKernelInvoker + : HIPParallelLaunchKernelFunc { + using base_t = + HIPParallelLaunchKernelFunc; + static_assert(sizeof(DriverType) < HIPTraits::ConstantMemoryUsage, + "Kokkos Error: Requested HIPLaunchConstantMemory with a " + "Functor larger than 32kB."); + + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + // Wait until the previous kernel that uses the constant buffer is done + KOKKOS_IMPL_HIP_SAFE_CALL( + hipEventSynchronize(hip_instance->constantMemReusable)); + + // Copy functor (synchronously) to staging buffer in pinned host memory + unsigned long *staging = hip_instance->constantMemHostStaging; + std::memcpy((void *)staging, (void *)driver, sizeof(DriverType)); + + // Copy functor asynchronously from there to constant memory on the device + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbolAsync( + HIP_SYMBOL(kokkos_impl_hip_constant_memory_buffer), staging, + sizeof(DriverType), 0, hipMemcpyHostToDevice, hip_instance->m_stream)); + + // Invoke the driver function on the device + (base_t:: + get_kernel_func())<<m_stream>>>(); + + // Record an event that says when the constant buffer can be reused + KOKKOS_IMPL_HIP_SAFE_CALL(hipEventRecord(hip_instance->constantMemReusable, + hip_instance->m_stream)); + } +}; + 
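The constant-memory invoker above relies on a single pinned staging buffer and a single event to serialize successive launches that share the constant buffer. A compilable sketch of that handshake (requires a HIP toolchain; `buffer`, `driver_kernel`, and `launch_with_constant_payload` are hypothetical stand-ins, and error checking is omitted):

```cpp
#include <hip/hip_runtime.h>
#include <cstring>

// Hypothetical 32 kB constant buffer standing in for
// kokkos_impl_hip_constant_memory_buffer.
__constant__ unsigned long buffer[0x8000 / sizeof(unsigned long)];

__global__ void driver_kernel() { /* stand-in for the functor kernel */ }

// Mirrors the handshake in invoke_kernel: wait, stage, copy, launch, record.
void launch_with_constant_payload(const void* payload, size_t bytes,
                                  unsigned long* pinned_staging,
                                  hipEvent_t reusable, hipStream_t stream) {
  hipEventSynchronize(reusable);                // prior launch done reading
  std::memcpy(pinned_staging, payload, bytes);  // sync copy to pinned host
  hipMemcpyToSymbolAsync(HIP_SYMBOL(buffer), pinned_staging, bytes, 0,
                         hipMemcpyHostToDevice, stream);
  driver_kernel<<<dim3(1), dim3(64), 0, stream>>>();
  hipEventRecord(reusable, stream);             // buffer free after kernel
}
```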
+//-----------------------------// +// HIPParallelLaunch structure // +//-----------------------------// template , - HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory> + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> struct HIPParallelLaunch; template + unsigned int MinBlocksPerSM, HIPLaunchMechanism LaunchMechanism> struct HIPParallelLaunch< DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory> + LaunchMechanism> : HIPParallelLaunchKernelInvoker< DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory> { + LaunchMechanism> { using base_t = HIPParallelLaunchKernelInvoker< DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory>; + LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, @@ -205,22 +491,48 @@ struct HIPParallelLaunch< base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - HIP_SAFE_CALL(hipGetLastError()); - hip_instance->fence(); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGetLastError()); + hip_instance->fence( + "Kokkos::Experimental::Impl::HIParallelLaunch: Debug Only Check for " + "Execution Error"); #endif } } - - static hipFuncAttributes get_hip_func_attributes() { - static hipFuncAttributes attr = []() { - hipFuncAttributes attr; - HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, reinterpret_cast(base_t::get_kernel_func()))); - return attr; - }(); - return attr; - } }; + +// convenience method to launch the correct kernel given the launch bounds et +// al. +template , + HIPLaunchMechanism LaunchMechanism = + DeduceHIPLaunchMechanism::launch_mechanism> +void hip_parallel_launch(const DriverType &driver, const dim3 &grid, + const dim3 &block, const int shmem, + const HIPInternal *hip_instance, + const bool prefer_shmem) { + // FIXME_HIP - could be if constexpr for c++17 + if (!HIPParallelLaunch::default_launchbounds()) { + // for user defined, we *always* honor the request + HIPParallelLaunch( + driver, grid, block, shmem, hip_instance, prefer_shmem); + } else { + // we can do what we like + const unsigned flat_block_size = block.x * block.y * block.z; + if (flat_block_size <= HIPTraits::ConservativeThreadsPerBlock) { + // we have to use the large blocksize + HIPParallelLaunch< + DriverType, + Kokkos::LaunchBounds, + LaunchMechanism>(driver, grid, block, shmem, hip_instance, + prefer_shmem); + } else { + HIPParallelLaunch, + LaunchMechanism>(driver, grid, block, shmem, + hip_instance, prefer_shmem); + } + } +} } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp index 4f5271b6f6..c4292d35ec 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.cpp @@ -84,11 +84,17 @@ namespace Impl { HIPLockArrays g_host_hip_lock_arrays = {nullptr, nullptr, 0}; void initialize_host_hip_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::init_lock_arrays(); + + DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); +#endif + if (g_host_hip_lock_arrays.atomic != nullptr) return; - HIP_SAFE_CALL(hipMalloc( + KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc( &g_host_hip_lock_arrays.atomic, sizeof(std::int32_t) * (KOKKOS_IMPL_HIP_SPACE_ATOMIC_MASK + 1))); - HIP_SAFE_CALL(hipMalloc( + KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc( &g_host_hip_lock_arrays.scratch, sizeof(std::int32_t) * (::Kokkos::Experimental::HIP::concurrency()))); @@ -103,10 
+109,14 @@ void initialize_host_hip_lock_arrays() { } void finalize_host_hip_lock_arrays() { +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + desul::Impl::finalize_lock_arrays(); +#endif + if (g_host_hip_lock_arrays.atomic == nullptr) return; - HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.atomic)); g_host_hip_lock_arrays.atomic = nullptr; - HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(g_host_hip_lock_arrays.scratch)); g_host_hip_lock_arrays.scratch = nullptr; g_host_hip_lock_arrays.n = 0; #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp index f34f85f43b..71b104c2e4 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Locks.hpp @@ -51,6 +51,10 @@ #include +#ifdef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS +#include +#endif + namespace Kokkos { namespace Impl { @@ -147,7 +151,7 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } #define KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ { \ if (::Kokkos::Impl::lock_array_copied == 0) { \ - HIP_SAFE_CALL(hipMemcpyToSymbol( \ + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyToSymbol( \ HIP_SYMBOL(::Kokkos::Impl::g_device_hip_lock_arrays), \ &::Kokkos::Impl::g_host_hip_lock_arrays, \ sizeof(::Kokkos::Impl::HIPLockArrays))); \ @@ -155,6 +159,8 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } ::Kokkos::Impl::lock_array_copied = 1; \ } +#ifndef KOKKOS_ENABLE_IMPL_DESUL_ATOMICS + #ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() #else @@ -162,6 +168,19 @@ inline int eliminate_warning_for_lock_array() { return lock_array_copied; } KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() #endif +#else + +#ifdef KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE +#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +#else +// Still Need COPY_CUDA_LOCK_ARRAYS for team scratch etc. 
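Before the combined macro that follows, the copy-once guard these lock-array macros implement can be sketched in a few lines (illustrative names only; the real macros additionally expand to nothing under relocatable device code, where a single copy suffices):

```cpp
#include <cstdio>

static int lock_array_copied = 0;  // one flag per translation unit

// Copy the host-side lock-array pointers to the device the first time
// a kernel from this translation unit launches; afterwards, do nothing.
void ensure_lock_arrays_on_device() {
  if (lock_array_copied == 0) {
    // hipMemcpyToSymbol of the host-side lock-array struct goes here.
    std::puts("copying lock arrays to device (first launch in this TU)");
  }
  lock_array_copied = 1;
}
```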
+#define KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() \ + KOKKOS_COPY_HIP_LOCK_ARRAYS_TO_DEVICE() \ + DESUL_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE() +#endif + +#endif /* defined( KOKKOS_ENABLE_IMPL_DESUL_ATOMICS ) */ + #endif /* defined( __HIPCC__ ) */ #endif /* #ifndef KOKKOS_HIP_LOCKS_HPP */ diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp index ce1aff9586..acb538e1cb 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp @@ -28,7 +28,8 @@ inline TileSizeProperties get_tile_size_properties( space.impl_internal_space_instance()->m_maxThreadsPerSM; properties.default_largest_tile_size = 16; properties.default_tile_size = 4; - properties.max_total_tile_size = 1024; + properties.max_total_tile_size = + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; return properties; } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 35e7d6fb85..eae323dd91 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -81,6 +81,8 @@ class ParallelFor, } inline void execute() const { + using ClosureType = + ParallelFor; if (m_policy.m_num_tiles == 0) return; array_index_type const maxblocks = static_cast( m_policy.space().impl_internal_space_instance()->m_maxBlock); @@ -94,7 +96,8 @@ class ParallelFor, block.y, maxblocks), 1); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 3) { @@ -110,7 +113,8 @@ class ParallelFor, std::min((m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / block.z, maxblocks)); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 4) { @@ -128,7 +132,8 @@ class ParallelFor, std::min((m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / block.z, maxblocks)); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 5) { @@ -147,7 +152,8 @@ class ParallelFor, std::min((m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / block.z, maxblocks)); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else if (Policy::rank == 6) { @@ -165,7 +171,8 @@ class ParallelFor, std::min(static_cast(m_policy.m_tile_end[4] * m_policy.m_tile_end[5]), static_cast(maxblocks))); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } else { @@ -178,22 +185,18 @@ class ParallelFor, : m_functor(arg_functor), m_policy(arg_policy) {} template - static int max_tile_size_product(const Policy& pol, const Functor&) { + static int max_tile_size_product(const Policy&, const Functor&) { using closure_type = ParallelFor, Kokkos::Experimental::HIP>; - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - auto const& prop = 
pol.space().hip_device_prop(); - // Limits due to registers/SM, MDRange doesn't have - // shared memory constraints - int const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast( - Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + unsigned block_size = + Kokkos::Experimental::Impl::hip_get_max_blocksize(); + if (block_size == 0) + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "tile size.")); + return block_size; } }; @@ -242,6 +245,9 @@ class ParallelReduce, ReducerType, const bool m_result_ptr_device_accessible; size_type* m_scratch_space; size_type* m_scratch_flags; + // Only let one Parallel/Scan modify the shared memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_shared_memory_lock; using DeviceIteratePattern = typename Kokkos::Impl::Reduce::DeviceIterateTile< Policy::rank, Policy, FunctorType, WorkTag, reference_type>; @@ -307,32 +313,30 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: // This is copy/paste from Kokkos_HIP_Parallel_Range inline unsigned local_block_size(const FunctorType& f) { - unsigned int n = - ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; - int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< - false, FunctorType, WorkTag>(f, n); - using closure_type = Impl::ParallelReduce; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || - (n > - static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size( - m_policy.space().impl_internal_space_instance(), attr, f, 1, - shmem_size, 0)))) { - n >>= 1; - shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< - false, FunctorType, WorkTag>(f, n); + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + using closure_type = ParallelReduce; + + unsigned block_size = + Kokkos::Experimental::Impl::hip_get_preferred_blocksize( + instance, shmem_functor); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid tile size.")); } - return n; + return block_size; } inline void execute() { - const int nwork = m_policy.m_num_tiles; + using ClosureType = ParallelReduce; + const int nwork = m_policy.m_num_tiles; if (nwork) { int block_size = m_policy.m_prod_tile_dims; // CONSTRAINT: Algorithm requires block_size >= product of tile dimensions @@ -366,14 +370,16 @@ class ParallelReduce, ReducerType, ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(m_functor, block.y); - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Impl::ParallelReduce: fence because " + "reduction can't access result storage location"); if (m_result_ptr) { const int size = 
ValueTraits::value_size( @@ -403,7 +409,10 @@ class ParallelReduce, ReducerType, MemorySpaceAccess::accessible), m_scratch_space(nullptr), - m_scratch_flags(nullptr) {} + m_scratch_flags(nullptr), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) @@ -416,23 +425,25 @@ class ParallelReduce, ReducerType, typename ReducerType::result_view_type:: memory_space>::accessible), m_scratch_space(nullptr), - m_scratch_flags(nullptr) {} + m_scratch_flags(nullptr), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} + template - static int max_tile_size_product(const Policy& pol, const Functor&) { + static int max_tile_size_product(const Policy&, const Functor&) { using closure_type = ParallelReduce, ReducerType, Kokkos::Experimental::HIP>; - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - auto const& prop = pol.space().hip_device_prop(); - // Limits due do registers/SM - int const regs_per_sm = prop.regsPerMultiprocessor; - int const regs_per_thread = attr.numRegs; - int const max_threads_per_sm = regs_per_sm / regs_per_thread; - return std::min( - max_threads_per_sm, - static_cast( - Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + unsigned block_size = + Kokkos::Experimental::Impl::hip_get_max_blocksize(); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid tile size.")); + } + return block_size; } }; } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 7d2825eeb4..e02ead1e99 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -108,16 +108,21 @@ class ParallelFor, inline void execute() const { const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + using DriverType = + ParallelFor; const int block_size = - LaunchBounds::maxTperB - ? LaunchBounds::maxTperB - : ::Kokkos::Experimental::Impl::HIPTraits:: - MaxThreadsPerBlock; // FIXME_HIP Choose block_size better + Kokkos::Experimental::Impl::hip_get_preferred_blocksize(); const dim3 block(1, block_size, 1); const dim3 grid( typename Policy::index_type((nwork + block.y - 1) / block.y), 1, 1); - Kokkos::Experimental::Impl::HIPParallelLaunch( + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a " + "valid execution configuration.")); + } + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), false); } @@ -173,15 +178,12 @@ class ParallelReduce, ReducerType, const bool m_result_ptr_host_accessible; size_type* m_scratch_space = nullptr; size_type* m_scratch_flags = nullptr; + // Only let one ParallelReduce/Scan modify the shared memory. The + // constructor acquires the mutex which is released in the destructor. 
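The `m_shared_memory_lock` member declared just below implements scoped exclusion via RAII. A self-contained sketch of the idiom, with hypothetical `InstanceLike`/`ParallelReduceLike` types: the construct acquires the per-instance mutex at construction, and the lock member's destructor releases it when the construct goes out of scope.

```cpp
#include <mutex>

struct InstanceLike { std::mutex shared_memory_mutex; };

struct ParallelReduceLike {
  // Acquire on construction; no other ParallelReduce/Scan on the same
  // instance can touch the shared-memory scratch until this one dies.
  explicit ParallelReduceLike(InstanceLike& inst)
      : m_shared_memory_lock(inst.shared_memory_mutex) {}
  std::unique_lock<std::mutex> m_shared_memory_lock;  // released in dtor
};
```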
+ std::unique_lock m_shared_memory_lock; -#if HIP_VERSION < 401 - static bool constexpr UseShflReduction = - ((sizeof(value_type) > 2 * sizeof(double)) && - static_cast(ValueTraits::StaticValueSize)); -#else static bool constexpr UseShflReduction = static_cast(ValueTraits::StaticValueSize); -#endif private: struct ShflReductionTag {}; @@ -328,30 +330,15 @@ class ParallelReduce, ReducerType, // Determine block size constrained by shared memory: inline unsigned local_block_size(const FunctorType& f) { - unsigned int n = - ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; - int shmem_size = - hip_single_inter_block_reduce_scan_shmem( - f, n); - using closure_type = Impl::ParallelReduce; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, LaunchBounds>::get_hip_func_attributes(); - while ( - (n && - (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < - shmem_size)) || - (n > - static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size( - m_policy.space().impl_internal_space_instance(), attr, f, 1, - shmem_size, 0)))) { - n >>= 1; - shmem_size = - hip_single_inter_block_reduce_scan_shmem( - f, n); - } - return n; + const auto& instance = m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem(f, n); + }; + using DriverType = ParallelReduce; + return Kokkos::Experimental::Impl::hip_get_preferred_blocksize< + DriverType, LaunchBounds>(instance, shmem_functor); } inline void execute() { @@ -362,7 +349,11 @@ class ParallelReduce, ReducerType, !std::is_same::value; if ((nwork > 0) || need_device_set) { const int block_size = local_block_size(m_functor); - KOKKOS_ASSERT(block_size > 0); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid execution configuration.")); + } m_scratch_space = ::Kokkos::Experimental::Impl::hip_internal_scratch_space( @@ -391,14 +382,17 @@ class ParallelReduce, ReducerType, WorkTag>(m_functor, block.y); - Kokkos::Experimental::Impl::HIPParallelLaunch( + using DriverType = ParallelReduce; + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute if (!m_result_ptr_device_accessible) { - m_policy.space().impl_internal_space_instance()->fence(); + m_policy.space().impl_internal_space_instance()->fence( + "Kokkos::Impl::ParallelReduce: fence because " + "reduction can't access result storage location"); if (m_result_ptr) { const int size = ValueTraits::value_size( @@ -429,7 +423,10 @@ class ParallelReduce, ReducerType, typename ViewType::memory_space>::accessible), m_result_ptr_host_accessible( MemorySpaceAccess::accessible) {} + typename ViewType::memory_space>::accessible), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} ParallelReduce(const FunctorType& arg_functor, const Policy& arg_policy, const ReducerType& reducer) @@ -444,7 +441,10 @@ class ParallelReduce, ReducerType, m_result_ptr_host_accessible( MemorySpaceAccess::accessible) {} + memory_space>::accessible), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} }; template @@ -482,6 +482,9 @@ class ParallelScanHIPBase { size_type* m_scratch_flags = nullptr; size_type m_final = false; int m_grid_x = 0; + // Only let one ParallelReduce/Scan modify the shared 
memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_shared_memory_lock; private: template @@ -624,22 +627,7 @@ class ParallelScanHIPBase { } // Determine block size constrained by shared memory: - inline unsigned local_block_size(const FunctorType& f) { - // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or - // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y - // - // TODO check best option - - unsigned n = Experimental::Impl::HIPTraits::WarpSize * 4; - while (n && static_cast(m_policy.space() - .impl_internal_space_instance() - ->m_maxShmemPerBlock) < - hip_single_inter_block_reduce_scan_shmem(f, n)) { - n >>= 1; - } - return n; - } + virtual inline unsigned local_block_size(const FunctorType& f) = 0; inline void impl_execute() { const index_type nwork = m_policy.end() - m_policy.begin(); @@ -649,7 +637,11 @@ class ParallelScanHIPBase { const int gridMaxComputeCapability_2x = 0x01fff; const int block_size = static_cast(local_block_size(m_functor)); - KOKKOS_ASSERT(block_size > 0); + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelScan< HIP > could not find a " + "valid execution configuration.")); + } const int grid_max = std::min(block_size * block_size, gridMaxComputeCapability_2x); @@ -674,15 +666,16 @@ class ParallelScanHIPBase { const int shmem = ValueTraits::value_size(m_functor) * (block_size + 2); m_final = false; - Kokkos::Experimental::Impl::HIPParallelLaunch( + // these ones are OK to be just the base because the specializations + // do not modify the kernel at all + using DriverType = ParallelScanHIPBase; + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute m_final = true; - Kokkos::Experimental::Impl::HIPParallelLaunch( + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem, m_policy.space().impl_internal_space_instance(), false); // copy to device and execute @@ -690,13 +683,17 @@ class ParallelScanHIPBase { } ParallelScanHIPBase(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} + : m_functor(arg_functor), + m_policy(arg_policy), + m_shared_memory_lock(m_policy.space() + .impl_internal_space_instance() + ->m_mutexSharedMemory) {} }; template class ParallelScan, Kokkos::Experimental::HIP> - : private ParallelScanHIPBase { + : public ParallelScanHIPBase { public: using Base = ParallelScanHIPBase; using Base::operator(); @@ -706,6 +703,23 @@ class ParallelScan, ParallelScan(const FunctorType& arg_functor, const typename Base::Policy& arg_policy) : Base(arg_functor, arg_policy) {} + + inline unsigned local_block_size(const FunctorType& f) { + // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or + // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y + + const auto& instance = + Base::m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem( + f, n); + }; + using DriverType = ParallelScan; + return Kokkos::Experimental::Impl::hip_get_preferred_blocksize< + DriverType, typename Base::LaunchBounds>(instance, shmem_functor); + } }; //---------------------------------------------------------------------------- @@ -713,7 +727,7 @@ class ParallelScan, template class ParallelScanWithTotal, ReturnType, Kokkos::Experimental::HIP> - : private ParallelScanHIPBase { + : 
public ParallelScanHIPBase { public: using Base = ParallelScanHIPBase; using Base::operator(); @@ -737,6 +751,24 @@ class ParallelScanWithTotal, const typename Base::Policy& arg_policy, ReturnType& arg_returnvalue) : Base(arg_functor, arg_policy), m_returnvalue(arg_returnvalue) {} + + inline unsigned local_block_size(const FunctorType& f) { + // blockDim.y must be power of two = 128 (2 warps) or 256 (4 warps) or + // 512 (8 warps) gridDim.x <= blockDim.y * blockDim.y + + const auto& instance = + Base::m_policy.space().impl_internal_space_instance(); + auto shmem_functor = [&f](unsigned n) { + return hip_single_inter_block_reduce_scan_shmem( + f, n); + }; + using DriverType = + ParallelScanWithTotal; + return Kokkos::Experimental::Impl::hip_get_preferred_blocksize< + DriverType, typename Base::LaunchBounds>(instance, shmem_functor); + } }; } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 96c3ff2a75..b794f5bc03 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -56,20 +56,20 @@ namespace Kokkos { namespace Impl { + template class TeamPolicyInternal : public PolicyTraits { public: using execution_policy = TeamPolicyInternal; - using traits = PolicyTraits; + using traits = PolicyTraits; + using BlockType = Kokkos::Experimental::Impl::BlockType; template friend class TeamPolicyInternal; private: - static int constexpr MAX_WARP = 8; - typename traits::execution_space m_space; int m_league_size; int m_team_size; @@ -101,17 +101,9 @@ class TeamPolicyInternal template int team_size_max(FunctorType const& f, ParallelForTag const&) const { using closure_type = - Impl::ParallelFor >; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, - typename traits::launch_bounds>::get_hip_func_attributes(); - int const block_size = ::Kokkos::Experimental::Impl::hip_get_max_block_size< - FunctorType, typename traits::launch_bounds>( - space().impl_internal_space_instance(), attr, f, - static_cast(impl_vector_length()), - static_cast(team_scratch_size(0)) + 2 * sizeof(double), - static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / impl_vector_length(); + Impl::ParallelFor>; + + return internal_team_size_common(f); } template @@ -129,8 +121,8 @@ class TeamPolicyInternal return internal_team_size_max(f); } - template - inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, + template + inline int team_size_max(const FunctorType& f, const ReducerType&, const ParallelReduceTag&) const { using closure_type = Impl::ParallelReduce, @@ -141,17 +133,9 @@ class TeamPolicyInternal template int team_size_recommended(FunctorType const& f, ParallelForTag const&) const { using closure_type = - Impl::ParallelFor >; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, - typename traits::launch_bounds>::get_hip_func_attributes(); - int const block_size = ::Kokkos::Experimental::Impl::hip_get_opt_block_size< - FunctorType, typename traits::launch_bounds>( - space().impl_internal_space_instance(), attr, f, - static_cast(impl_vector_length()), - static_cast(team_scratch_size(0)) + 2 * sizeof(double), - static_cast(thread_scratch_size(0)) + sizeof(double)); - return block_size / impl_vector_length(); + Impl::ParallelFor>; + + return internal_team_size_common(f); } template @@ -169,7 +153,7 @@ class TeamPolicyInternal return 
internal_team_size_recommended(f); } - template + template int team_size_recommended(FunctorType const& f, ReducerType const&, ParallelReduceTag const&) const { using closure_type = @@ -177,6 +161,7 @@ class TeamPolicyInternal ReducerType>; return internal_team_size_recommended(f); } + inline bool impl_auto_vector_length() const { return m_tune_vector_length; } inline bool impl_auto_team_size() const { return m_tune_team_size; } static int vector_length_max() { @@ -211,7 +196,10 @@ class TeamPolicyInternal inline void impl_set_vector_length(size_t size) { m_vector_length = size; } inline void impl_set_team_size(size_t size) { m_team_size = size; } int impl_vector_length() const { return m_vector_length; } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 KOKKOS_DEPRECATED int vector_length() const { return impl_vector_length(); } +#endif int team_size() const { return m_team_size; } @@ -266,7 +254,8 @@ class TeamPolicyInternal "space."); // Make sure total block size is permissible - if (m_team_size * m_vector_length > 1024) { + if (m_team_size * m_vector_length > + ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock) { Impl::throw_runtime_exception( std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " "Team size x vector length must be smaller than 1024.")); @@ -363,26 +352,84 @@ class TeamPolicyInternal using member_type = Kokkos::Impl::HIPTeamMember; protected: - template - int internal_team_size_common(const FunctorType& f, - BlockSizeCallable&& block_size_callable) const { - using closure_type = ClosureType; + template + int internal_team_size_common(const FunctorType& f) const { + // FIXME_HIP: this could be unified with the + // internal_team_size_common_reduce + // once we can turn c++17 constexpr on by default. + // The problem right now is that we can't turn off the evaluation + // of the functor_value_traits's valuesize / StaticValueSize + + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double); + const int vector_length = impl_vector_length(); + + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } else { + block_size = + ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelFor< HIP > could not find a valid " + "team size.")); + } + return block_size / impl_vector_length(); + } + + template + int internal_team_size_common_reduce(const FunctorType& f) const { using functor_value_traits = Impl::FunctorValueTraits; - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - closure_type, - typename traits::launch_bounds>::get_hip_func_attributes(); - const int block_size = std::forward(block_size_callable)( - space().impl_internal_space_instance(), attr, f, - 
static_cast(impl_vector_length()), - static_cast(team_scratch_size(0)) + 2 * sizeof(double), - static_cast(thread_scratch_size(0)) + sizeof(double) + - ((functor_value_traits::StaticValueSize != 0) - ? 0 - : functor_value_traits::value_size(f))); - KOKKOS_ASSERT(block_size > 0); + const unsigned shmem_block = team_scratch_size(0) + 2 * sizeof(double); + const unsigned shmem_thread = thread_scratch_size(0) + sizeof(double) + + ((functor_value_traits::StaticValueSize != 0) + ? 0 + : functor_value_traits::value_size(f)); + const int vector_length = impl_vector_length(); + const auto functor = [&f, shmem_block, shmem_thread, vector_length]( + const hipFuncAttributes& attr, int block_size) { + int functor_shmem = + ::Kokkos::Impl::FunctorTeamShmemSize::value( + f, block_size / vector_length); + return shmem_block + shmem_thread * (block_size / vector_length) + + functor_shmem + attr.sharedSizeBytes; + }; + int block_size; + // FIXME_HIP - could be if constexpr for c++17 + if (BlockSize == BlockType::Max) { + block_size = ::Kokkos::Experimental::Impl::hip_get_max_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } else { + block_size = + ::Kokkos::Experimental::Impl::hip_get_preferred_team_blocksize< + ClosureType, typename traits::launch_bounds>( + space().impl_internal_space_instance(), functor); + } + + if (block_size == 0) { + Kokkos::Impl::throw_runtime_exception( + std::string("Kokkos::Impl::ParallelReduce< HIP > could not find a " + "valid team size.")); + } // Currently we require Power-of-2 team size for reductions. int p2 = 1; while (p2 <= block_size) p2 *= 2; @@ -392,16 +439,13 @@ class TeamPolicyInternal template int internal_team_size_max(const FunctorType& f) const { - return internal_team_size_common( - f, ::Kokkos::Experimental::Impl::hip_get_max_block_size< - FunctorType, typename traits::launch_bounds>); + return internal_team_size_common_reduce(f); } template int internal_team_size_recommended(const FunctorType& f) const { - return internal_team_size_common( - f, ::Kokkos::Experimental::Impl::hip_get_opt_block_size< - FunctorType, typename traits::launch_bounds>); + return internal_team_size_common_reduce( + f); } }; @@ -505,7 +549,11 @@ class ParallelFor, dim3 const block(static_cast(m_vector_size), static_cast(m_team_size), 1); - ::Kokkos::Experimental::Impl::HIPParallelLaunch( + using closure_type = + ParallelFor, + Kokkos::Experimental::HIP>; + ::Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute @@ -520,17 +568,9 @@ class ParallelFor, m_scratch_lock(m_policy.space() .impl_internal_space_instance() ->m_team_scratch_mutex) { - hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< - ParallelFor, launch_bounds>::get_hip_func_attributes(); - m_team_size = - m_team_size >= 0 - ? m_team_size - : ::Kokkos::Experimental::Impl::hip_get_opt_block_size< - FunctorType, launch_bounds>( - m_policy.space().impl_internal_space_instance(), attr, - m_functor, m_vector_size, m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? 
m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelForTag()); m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -556,23 +596,12 @@ class ParallelFor, int const shmem_size_total = m_shmem_begin + m_shmem_size; if (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size_total) { - printf( - "%i %i\n", - m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock, - shmem_size_total); Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); } - if (static_cast(m_team_size) > - static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size( - m_policy.space().impl_internal_space_instance(), attr, - arg_functor, arg_policy.impl_vector_length(), - arg_policy.team_scratch_size(0), - arg_policy.thread_scratch_size(0)) / - arg_policy.impl_vector_length())) { + size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); } @@ -839,8 +868,11 @@ class ParallelReduce, } const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - Kokkos::Experimental::Impl::HIPParallelLaunch( + using closure_type = + ParallelReduce, + ReducerType, Kokkos::Experimental::HIP>; + Kokkos::Experimental::Impl::hip_parallel_launch( *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute @@ -890,17 +922,9 @@ class ParallelReduce, m_scratch_lock(m_policy.space() .impl_internal_space_instance() ->m_team_scratch_mutex) { - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - ParallelReduce, launch_bounds>::get_hip_func_attributes(); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Experimental::Impl::hip_get_opt_block_size( - m_policy.space().impl_internal_space_instance(), attr, - m_functor, m_vector_size, m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; + m_team_size = m_team_size >= 0 ? m_team_size + : arg_policy.team_size_recommended( + arg_functor, ParallelReduceTag()); m_team_begin = UseShflReduction @@ -958,8 +982,9 @@ class ParallelReduce, "L0 scratch memory")); } - if (static_cast(m_team_size) > - arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + size_t max_size = + arg_policy.team_size_max(arg_functor, ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " "large team size.")); @@ -992,18 +1017,10 @@ class ParallelReduce, m_scratch_lock(m_policy.space() .impl_internal_space_instance() ->m_team_scratch_mutex) { - hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< - ParallelReduce, launch_bounds>::get_hip_func_attributes(); - m_team_size = - m_team_size >= 0 - ? m_team_size - : Kokkos::Experimental::Impl::hip_get_opt_block_size( - m_policy.space().impl_internal_space_instance(), attr, - m_functor, m_vector_size, m_policy.team_scratch_size(0), - m_policy.thread_scratch_size(0)) / - m_vector_size; - + m_team_size = m_team_size >= 0 + ? m_team_size + : arg_policy.team_size_recommended(arg_functor, reducer, + ParallelReduceTag()); m_team_begin = UseShflReduction ? 0 @@ -1046,7 +1063,6 @@ class ParallelReduce, // upon team size. 
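The constructor hunks above stop querying hipFuncAttributes directly and instead derive the default team size from the policy's own team_size_recommended / team_size_max queries, so the backend and user code share one code path. A minimal usage sketch of those public queries follows; the functor, league size, and label are invented for the example and are not taken from the patch, and it assumes Kokkos::initialize has already run:

#include <Kokkos_Core.hpp>
#include <algorithm>

// Hypothetical team-level functor; any such functor is queried the same way.
struct ExampleTeamFunctor {
  KOKKOS_INLINE_FUNCTION void operator()(
      Kokkos::TeamPolicy<Kokkos::Experimental::HIP>::member_type const&) const {}
};

void example_team_size_queries() {
  using policy_type = Kokkos::TeamPolicy<Kokkos::Experimental::HIP>;
  policy_type policy(/* league_size = */ 64, Kokkos::AUTO);
  // The same queries the HIP backend now uses for its internal defaults:
  int const recommended = policy.team_size_recommended(ExampleTeamFunctor{},
                                                       Kokkos::ParallelForTag{});
  int const maximum =
      policy.team_size_max(ExampleTeamFunctor{}, Kokkos::ParallelForTag{});
  // Requesting more than `maximum` now throws the runtime exception added in
  // the hunk above instead of launching a misconfigured kernel.
  Kokkos::parallel_for("example_launch",
                       policy_type(64, std::min(recommended, maximum)),
                       ExampleTeamFunctor{});
}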
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size; - if ((!Kokkos::Impl::is_integral_power_of_two(m_team_size) && !UseShflReduction) || m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < @@ -1054,8 +1070,10 @@ class ParallelReduce, Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > bad team size")); } - if (static_cast(m_team_size) > - arg_policy.team_size_max(m_functor, m_reducer, ParallelReduceTag())) { + + size_t max_size = + arg_policy.team_size_max(arg_functor, reducer, ParallelReduceTag()); + if (static_cast(m_team_size) > static_cast(max_size)) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " "large team size.")); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 15ca089d14..e25ebe2ab3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -67,102 +67,32 @@ namespace { hipStream_t get_deep_copy_stream() { static hipStream_t s = nullptr; if (s == nullptr) { - HIP_SAFE_CALL(hipStreamCreate(&s)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&s)); } return s; } } // namespace -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); +void DeepCopyHIP(void* dst, void const* src, size_t n) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); } -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(const Kokkos::Experimental::HIP& - instance, - void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy:: - DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, - const void* src, size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy:: - DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, - const void* src, size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy::DeepCopy(void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL(hipMemcpy(dst, src, n, hipMemcpyDefault)); -} - -DeepCopy:: - DeepCopy(const Kokkos::Experimental::HIP& instance, void* dst, - const void* src, size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy::DeepCopy(const Kokkos::Experimental::HIP& - instance, - void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL( - hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); -} - -DeepCopy::DeepCopy(const Kokkos::Experimental::HIP& - instance, - void* dst, const void* src, - size_t n) { - HIP_SAFE_CALL( +void DeepCopyAsyncHIP(const Kokkos::Experimental::HIP& instance, void* dst, + void const* src, size_t n) { + KOKKOS_IMPL_HIP_SAFE_CALL( hipMemcpyAsync(dst, src, n, hipMemcpyDefault, instance.hip_stream())); } void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { 
hipStream_t s = get_deep_copy_stream(); - HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s)); - HIP_SAFE_CALL(hipStreamSynchronize(s)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(dst, src, n, hipMemcpyDefault, s)); + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + "Kokkos::Impl::DeepCopyAsyncHIP: Post Deep Copy Fence on Deep-Copy " + "stream", + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + DeepCopyResourceSynchronization, + [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(s)); }); } } // namespace Impl @@ -171,6 +101,7 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3 namespace Kokkos { KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() { @@ -188,6 +119,7 @@ KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) { } } // namespace Kokkos +#endif /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ @@ -283,7 +215,7 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, @@ -307,7 +239,7 @@ void HIPHostPinnedSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } - HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(arg_alloc_ptr)); } } // namespace Experimental @@ -427,23 +359,42 @@ HIP::HIP() "HIP instance constructor"); } -HIP::HIP(hipStream_t const stream) +HIP::HIP(hipStream_t const stream, bool manage_stream) : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { ptr->finalize(); delete ptr; }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); - m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream); + m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream, + manage_stream); } void HIP::print_configuration(std::ostream& s, const bool) { Impl::HIPInternal::singleton().print_configuration(s); } -void HIP::impl_static_fence() { HIP_SAFE_CALL(hipDeviceSynchronize()); } +uint32_t HIP::impl_instance_id() const noexcept { + return m_space_instance->impl_get_instance_id(); +} +void HIP::impl_static_fence(const std::string& name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HIP>( + name, + Kokkos::Tools::Experimental::SpecialSynchronizationCases:: + GlobalDeviceSynchronization, + [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); }); +} +void HIP::impl_static_fence() { + impl_static_fence("Kokkos::HIP::impl_static_fence: Unnamed Static Fence"); +} -void HIP::fence() const { m_space_instance->fence(); } +void HIP::fence(const std::string& name) const { + m_space_instance->fence(name); +} +void HIP::fence() const { + fence("Kokkos::HIP::fence(): Unnamed Instance Fence"); +} hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; } @@ -489,6 +440,9 @@ void HIPSpaceInitializer::finalize(const bool all_spaces) { void HIPSpaceInitializer::fence() { Kokkos::Experimental::HIP::impl_static_fence(); } +void 
HIPSpaceInitializer::fence(const std::string& name) { + Kokkos::Experimental::HIP::impl_static_fence(name); +} void HIPSpaceInitializer::print_configuration(std::ostream& msg, const bool detail) { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fe52886ced..fb67a25c5e 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -316,198 +316,6 @@ class HIPTeamMember { #endif } - //-------------------------------------------------------------------------- - /**\brief Global reduction across all blocks - * - * Return !0 if reducer contains the final value - */ - template - KOKKOS_INLINE_FUNCTION static - typename std::enable_if::value, int>::type - global_reduce(ReducerType const& reducer, int* const global_scratch_flags, - void* const global_scratch_space, void* const shmem, - int const shmem_size) { -#ifdef __HIP_DEVICE_COMPILE__ - using value_type = typename ReducerType::value_type; - using pointer_type = value_type volatile*; - - // Number of shared memory entries for the reduction: - const int nsh = shmem_size / sizeof(value_type); - - // Number of HIP threads in the block, rank within the block - const int nid = blockDim.x * blockDim.y * blockDim.z; - const int tid = - threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); - - // Reduces within block using all available shared memory - // Contributes if it is the root "vector lane" - - // wn == number of warps in the block - // wx == which lane within the warp - // wy == which warp within the block - - const int wn = (nid + Experimental::Impl::HIPTraits::WarpIndexMask) >> - Experimental::Impl::HIPTraits::WarpIndexShift; - const int wx = tid & Experimental::Impl::HIPTraits::WarpIndexMask; - const int wy = tid >> Experimental::Impl::HIPTraits::WarpIndexShift; - - //------------------------ - { // Intra warp shuffle reduction from contributing HIP threads - - value_type tmp(reducer.reference()); - - int constexpr warp_size = - ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; - for (int i = warp_size; static_cast(blockDim.x) <= (i >>= 1);) { - Experimental::Impl::in_place_shfl_down(reducer.reference(), tmp, i, - warp_size); - - // Root of each vector lane reduces "thread" contribution - if (0 == threadIdx.x && wx < i) { - reducer.join(&tmp, reducer.data()); - } - } - - // Reduce across warps using shared memory. - // Number of warps may not be power of two. - - __syncthreads(); // Wait before shared data write - - // Number of shared memory entries for the reduction - // is at most one per warp - const int nentry = wn < nsh ? wn : nsh; - - if (0 == wx && wy < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - (reinterpret_cast(shmem))[wy] = tmp; - } - - __syncthreads(); // Wait for write to be visible to block - - // When more warps than shared entries - // then warps must take turns joining their contribution - // to the designated shared memory entry. 
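The turn-taking the two comments above describe can be shown in isolation. A simplified sketch only, compiled as HIP/CUDA device code: the names are invented, the join is hard-coded to a double sum, and the Kokkos reducer machinery is stripped out (the removed code below generalizes this over ReducerType):

// wn warps contribute but only nentry shared slots exist; warps past the
// first nentry wait for their round i and join into slot wy - i.
__device__ void turn_taking_join(double* shmem, double warp_value, int wx,
                                 int wy, int wn, int nentry) {
  if (wx == 0 && wy < nentry) shmem[wy] = warp_value;  // round 0: store
  __syncthreads();
  for (int i = nentry; i < wn; i += nentry) {
    const int k = wy - i;
    if (wx == 0 && i <= wy && k < nentry)
      shmem[k] += warp_value;  // later rounds: join into the designated slot
    __syncthreads();  // make each round's writes visible to the block
  }
}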
- for (int i = nentry; i < wn; i += nentry) { - const int k = wy - i; - - if (0 == wx && i <= wy && k < nentry) { - // Root thread of warp 'wy' has warp's value to contribute - reducer.join((reinterpret_cast(shmem)) + k, &tmp); - } - - __syncthreads(); // Wait for write to be visible to block - } - - // One warp performs the inter-warp reduction: - - if (0 == wy) { - // Start fan-in at power of two covering nentry - - for (int i = (1 << (warp_size - __clz(nentry - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nentry) { - reducer.join((reinterpret_cast(shmem)) + wx, - (reinterpret_cast(shmem)) + k); - __threadfence_block(); // Wait for write to be visible to warp - } - } - } - } - //------------------------ - { // Write block's value to global_scratch_memory - - int last_block = 0; - - if (0 == wx) { - reducer.copy((reinterpret_cast(global_scratch_space)) + - blockIdx.x * reducer.length(), - reducer.data()); - - __threadfence(); // Wait until global write is visible. - - last_block = static_cast(gridDim.x) == - 1 + Kokkos::atomic_fetch_add(global_scratch_flags, 1); - - // If last block then reset count - if (last_block) *global_scratch_flags = 0; - } - - // FIXME hip does not support __syncthreads_or so we need to do it by hand - // last_block = __syncthreads_or(last_block); - - __shared__ int last_block_shared; - if (last_block) last_block_shared = last_block; - __threadfence_block(); - - if (!last_block_shared) return 0; - } - //------------------------ - // Last block reads global_scratch_memory into shared memory. - - const int nentry = nid < gridDim.x ? (nid < nsh ? nid : nsh) - : (gridDim.x < nsh ? gridDim.x : nsh); - - // nentry = min( nid , nsh , gridDim.x ) - - // whole block reads global memory into shared memory: - - if (tid < nentry) { - const int offset = tid * reducer.length(); - - reducer.copy( - (reinterpret_cast(shmem)) + offset, - (reinterpret_cast(global_scratch_space)) + offset); - - for (int i = nentry + tid; i < static_cast(gridDim.x); i += nentry) { - reducer.join((reinterpret_cast(shmem)) + offset, - (reinterpret_cast(global_scratch_space)) + - i * reducer.length()); - } - } - - __syncthreads(); // Wait for writes to be visible to block - - if (0 == wy) { - // Iterate to reduce shared memory to single warp fan-in size - - int constexpr warp_size = - ::Kokkos::Experimental::Impl::HIPTraits::WarpSize; - const int nreduce = warp_size < nentry ? 
warp_size : nentry; - - if (wx < nreduce && nreduce < nentry) { - for (int i = nreduce + wx; i < nentry; i += nreduce) { - reducer.join(((pointer_type)shmem) + wx, ((pointer_type)shmem) + i); - } - __threadfence_block(); // Wait for writes to be visible to warp - } - - // Start fan-in at power of two covering nentry - - for (int i = (1 << (warp_size - __clz(nreduce - 1))); (i >>= 1);) { - const int k = wx + i; - if (wx < i && k < nreduce) { - reducer.join((reinterpret_cast(shmem)) + wx, - (reinterpret_cast(shmem)) + k); - __threadfence_block(); // Wait for writes to be visible to warp - } - } - - if (0 == wx) { - reducer.copy(reducer.data(), reinterpret_cast(shmem)); - return 1; - } - } - return 0; -#else - (void)reducer; - (void)global_scratch_flags; - (void)global_scratch_space; - (void)shmem; - (void)shmem_size; - return 0; -#endif - } - //---------------------------------------- // Private for the driver diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp index 910d5e52e6..d9cb66e11f 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -191,6 +191,9 @@ void HPXSpaceInitializer::finalize(const bool all_spaces) { } void HPXSpaceInitializer::fence() { Kokkos::Experimental::HPX().fence(); } +void HPXSpaceInitializer::fence(const std::string &name) { + Kokkos::Experimental::HPX().fence(name); +} void HPXSpaceInitializer::print_configuration(std::ostream &msg, const bool detail) { diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index df09e026fd..7bb3ca5d00 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -82,7 +82,9 @@ class TaskQueueSpecialization< task_queue.scheduler = &scheduler; Kokkos::Impl::dispatch_execute_task(&task_queue, Kokkos::Experimental::HPX()); - Kokkos::Experimental::HPX().fence(); + Kokkos::Experimental::HPX().fence( + "Kokkos::Impl::TaskQueueSpecialization::execute: fence " + "after task execution"); } // Must provide task queue execution function @@ -214,7 +216,9 @@ class TaskQueueSpecializationConstrained< task_queue.scheduler = &scheduler; Kokkos::Impl::dispatch_execute_task(&task_queue, Kokkos::Experimental::HPX()); - Kokkos::Experimental::HPX().fence(); + Kokkos::Experimental::HPX().fence( + "Kokkos::Impl::TaskQueueSpecializationConstrained::execute: fence " + "after task execution"); } // Must provide task queue execution function diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp index 527fe12ad9..d7e13e28f0 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_WorkGraphPolicy.hpp @@ -79,7 +79,9 @@ class ParallelFor, public: void execute() const { dispatch_execute_task(this, m_policy.space()); - m_policy.space().fence(); + m_policy.space().fence( + "Kokkos::Experimental::Impl::HPX::ParallelFor: fence " + "after kernel execution"); } void execute_task() const { diff --git a/lib/kokkos/core/src/KokkosExp_InterOp.hpp b/lib/kokkos/core/src/KokkosExp_InterOp.hpp new file mode 100644 index 0000000000..37c2088f88 --- /dev/null +++ b/lib/kokkos/core/src/KokkosExp_InterOp.hpp @@ -0,0 +1,147 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS).
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_CORE_EXP_INTEROP_HPP +#define KOKKOS_CORE_EXP_INTEROP_HPP + +#include +#include +#include +#include +#include +#include + +namespace Kokkos { +namespace Impl { + +// ------------------------------------------------------------------ // +// this is used to convert +// Kokkos::Device to MemSpace +// +template +struct device_memory_space { + using type = Tp; +}; + +template +struct device_memory_space> { + using type = MemT; +}; + +template +using device_memory_space_t = typename device_memory_space::type; + +// ------------------------------------------------------------------ // +// this is the impl version which takes a view and converts to python +// view type +// +template +struct python_view_type_impl; + +template
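The device_memory_space trait above collapses a Kokkos::Device<ExecSpace, MemSpace> alias to its memory space and passes every other type through unchanged. A compile-time sanity check of that behavior, with the include path taken from the new file's location and the host spaces chosen purely for illustration:

#include <KokkosExp_InterOp.hpp>
#include <Kokkos_Core.hpp>
#include <type_traits>

// Plain memory spaces are passed through unchanged.
static_assert(
    std::is_same<Kokkos::Impl::device_memory_space_t<Kokkos::HostSpace>,
                 Kokkos::HostSpace>::value,
    "non-Device types pass through");
// A Device<ExecSpace, MemSpace> alias collapses to its memory space.
static_assert(
    std::is_same<Kokkos::Impl::device_memory_space_t<Kokkos::Device<
                     Kokkos::DefaultHostExecutionSpace, Kokkos::HostSpace>>,
                 Kokkos::HostSpace>::value,
    "Device<ExecSpace, MemSpace> collapses to MemSpace");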