diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index bdbc75604b..34df76a0fd 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,4 +1,25 @@
-# Change Log
+# CHANGELOG
+
+## [3.7.02](https://github.com/kokkos/kokkos/tree/3.7.02) (2023-05-17)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...3.7.02)
+
+### Backends and Archs Enhancements:
+#### CUDA
+- Add Hopper support and update nvcc_wrapper to work with CUDA-12 [\#5693](https://github.com/kokkos/kokkos/pull/5693)
+### General Enhancements:
+- sprintf -> snprintf [\#5787](https://github.com/kokkos/kokkos/pull/5787)
+### Build System:
+- Add error message when not using `hipcc` and when `CMAKE_CXX_STANDARD` is not set [\#5945](https://github.com/kokkos/kokkos/pull/5945)
+### Bug Fixes:
+- Fix Scratch allocation alignment issues [\#5692](https://github.com/kokkos/kokkos/pull/5692)
+- Fix Intel Classic Compiler ICE [\#5710](https://github.com/kokkos/kokkos/pull/5710)
+- Don't install std algorithm headers multiple times [\#5711](https://github.com/kokkos/kokkos/pull/5711)
+- Fix static init order issue in InitalizationSettings [\#5721](https://github.com/kokkos/kokkos/pull/5721)
+- Fix src/dst Properties in deep_copy(DynamicView,View) [\#5732](https://github.com/kokkos/kokkos/pull/5732)
+- Fix build on Fedora Rawhide [\#5782](https://github.com/kokkos/kokkos/pull/5782)
+- Finalize HIP lock arrays [\#5694](https://github.com/kokkos/kokkos/pull/5694)
+- Fix CUDA lock arrays for current Desul [\#5812](https://github.com/kokkos/kokkos/pull/5812)
+- Set the correct device/context in InterOp tests [\#5701](https://github.com/kokkos/kokkos/pull/5701)
 
 ## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt
index 7b78f29d73..404aad8065 100644
--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@@ -129,7 +129,7 @@ ENDIF()
 
 set(Kokkos_VERSION_MAJOR 3)
 set(Kokkos_VERSION_MINOR 7)
-set(Kokkos_VERSION_PATCH 01)
+set(Kokkos_VERSION_PATCH 02)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
 
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index b873fd1e06..361995d847 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -12,7 +12,7 @@ endif
 
 KOKKOS_VERSION_MAJOR = 3
 KOKKOS_VERSION_MINOR = 7
-KOKKOS_VERSION_PATCH = 01
+KOKKOS_VERSION_PATCH = 02
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
 
 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
diff --git a/lib/kokkos/algorithms/src/CMakeLists.txt b/lib/kokkos/algorithms/src/CMakeLists.txt
index 597626b111..606d83d18b 100644
--- a/lib/kokkos/algorithms/src/CMakeLists.txt
+++ b/lib/kokkos/algorithms/src/CMakeLists.txt
@@ -25,7 +25,7 @@ INSTALL (
 # These will get ignored for standalone CMake and a true interface library made
 KOKKOS_ADD_INTERFACE_LIBRARY(
   kokkosalgorithms
-  HEADERS ${ALGO_HEADERS}
+  NOINSTALLHEADERS ${ALGO_HEADERS}
   SOURCES ${ALGO_SOURCES}
 )
 KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms
diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake
index f20a91e42f..3aff6857a3 100644
--- a/lib/kokkos/cmake/kokkos_arch.cmake
+++ b/lib/kokkos/cmake/kokkos_arch.cmake
@@ -214,6 +214,9 @@ GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS)
 IF(KOKKOS_ENABLE_HIP)
   SET(AMDGPU_ARCH_FLAG "--offload-arch")
   IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC)
+    IF(KOKKOS_CXX_STANDARD STREQUAL 14 AND NOT CMAKE_CXX_STANDARD)
+      message(FATAL_ERROR "Set CMAKE_CXX_STANDARD to 14")
+    ENDIF()
     GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip)
     IF(DEFINED ENV{ROCM_PATH})
       GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH})
diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake
index 34e45ecf72..9a55d2a5e9 100644
--- a/lib/kokkos/cmake/kokkos_tribits.cmake
+++ b/lib/kokkos/cmake/kokkos_tribits.cmake
@@ -534,13 +534,6 @@ FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME)
 IF (KOKKOS_HAS_TRILINOS)
   TRIBITS_ADD_LIBRARY(${NAME} ${ARGN})
 ELSE()
-  CMAKE_PARSE_ARGUMENTS(PARSE
-    ""
-    ""
-    "HEADERS;SOURCES"
-    ${ARGN}
-  )
-
   ADD_LIBRARY(${NAME} INTERFACE)
   KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME})
 ENDIF()
diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
index a2b68064de..8450c06077 100644
--- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
+++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp
@@ -915,8 +915,8 @@ inline void deep_copy(const View<T, DP...>& dst,
 template <class T, class... DP, class... SP>
 inline void deep_copy(const Kokkos::Experimental::DynamicView<T, DP...>& dst,
                       const View<T, SP...>& src) {
-  using dst_type = Kokkos::Experimental::DynamicView<T, SP...>;
-  using src_type = View<T, DP...>;
+  using dst_type = Kokkos::Experimental::DynamicView<T, DP...>;
+  using src_type = View<T, SP...>;
 
   using dst_execution_space = typename ViewTraits<T, DP...>::execution_space;
   using src_memory_space    = typename ViewTraits<T, SP...>::memory_space;
diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
index 5345f8ea24..303e3643c9 100644
--- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
+++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp
@@ -240,6 +240,83 @@ struct TestDynamicView {
       ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2));
 #endif
     }  // end scope
+
+    // Test: Reproducer to demonstrate compile-time error of deep_copy
+    // of DynamicView to/from on-host View.
+    //   Case 4:
+    {
+      using device_view_type = Kokkos::View<Scalar*, Space>;
+      using host_view_type = typename Kokkos::View<Scalar*, Space>::HostMirror;
+
+      view_type device_dynamic_view("on-device DynamicView", 1024,
+                                    arg_total_size);
+      device_view_type device_view("on-device View", arg_total_size);
+      host_view_type host_view("on-host View", arg_total_size);
+
+      unsigned da_size = arg_total_size / 8;
+      device_dynamic_view.resize_serial(da_size);
+
+      // Use parallel_for to populate device_dynamic_view and verify values
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+      Kokkos::parallel_for(
+          Kokkos::RangePolicy<execution_space>(0, da_size),
+          KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); });
+
+      value_type result_sum = 0.0;
+      Kokkos::parallel_reduce(
+          Kokkos::RangePolicy<execution_space>(0, da_size),
+          KOKKOS_LAMBDA(const int i, value_type& partial_sum) {
+            partial_sum += (value_type)device_dynamic_view(i);
+          },
+          result_sum);
+
+      ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2));
+#endif
+
+      // Use an on-device View as intermediate to deep_copy the
+      // device_dynamic_view to host, zero out the device_dynamic_view,
+      // deep_copy from host back to the device_dynamic_view and verify
+      Kokkos::deep_copy(device_view, device_dynamic_view);
+      Kokkos::deep_copy(host_view, device_view);
+      Kokkos::deep_copy(device_view, host_view);
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+      Kokkos::parallel_for(
+          Kokkos::RangePolicy<execution_space>(0, da_size),
+          KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); });
+#endif
+      Kokkos::deep_copy(device_dynamic_view, device_view);
+#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
+      value_type new_result_sum = 0.0;
+      Kokkos::parallel_reduce(
+          Kokkos::RangePolicy<execution_space>(0, da_size),
+          KOKKOS_LAMBDA(const int i, value_type& partial_sum) {
+            partial_sum += (value_type)device_dynamic_view(i);
+          },
+          new_result_sum);
+
+      ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2));
+#endif
+
+      // Try to deep_copy device_dynamic_view directly to/from host.
+      // host-to-device currently fails to compile because DP and SP are
+      // swapped in the deep_copy implementation.
+      // Once that's fixed, both deep_copy's will fail at runtime because the
+      // destination execution space cannot access the source memory space.
+      try {
+        Kokkos::deep_copy(host_view, device_dynamic_view);
+      } catch (std::runtime_error const& error) {
+        std::string msg = error.what();
+        std::cerr << "Copy from on-device DynamicView to on-host View failed:\n"
+                  << msg << std::endl;
+      }
+      try {
+        Kokkos::deep_copy(device_dynamic_view, host_view);
+      } catch (std::runtime_error const& error) {
+        std::string msg = error.what();
+        std::cerr << "Copy from on-host View to on-device DynamicView failed:\n"
+                  << msg << std::endl;
+      }
+    }
   }
 };
 
diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp
index 54824e5b39..094c40bfa7 100644
--- a/lib/kokkos/core/perf_test/test_atomic.cpp
+++ b/lib/kokkos/core/perf_test/test_atomic.cpp
@@ -73,7 +73,7 @@ void textcolor(int attr, int fg, int bg) {
   char command[40];
 
   /* Command is the control command to the terminal */
-  sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
+  snprintf(command, 40, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
   printf("%s", command);
 }
 void textcolor_standard() { textcolor(RESET, BLACK, WHITE); }
diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
index 3785cfe80b..bdba322fcc 100644
--- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
+++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -428,6 +428,8 @@ void HIPInternal::finalize() {
 
   if (this == &singleton()) {
     (void)Kokkos::Impl::hip_global_unique_token_locks(true);
+    Kokkos::Impl::finalize_host_hip_lock_arrays();
+
     KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging));
     KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable));
   }
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index e42944d819..8d2aa4d188 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -228,11 +228,6 @@
 #define KOKKOS_ENABLE_PRAGMA_SIMD 1
 #endif
 
-// FIXME Workaround for ICE with intel 17,18,19,20,21 in Trilinos
-#if (KOKKOS_COMPILER_INTEL <= 2100)
-#define KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
-#endif
-
 // FIXME_SYCL
 #if !defined(KOKKOS_ENABLE_SYCL)
 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1
@@ -653,7 +648,8 @@ static constexpr bool kokkos_omp_on_host() { return false; }
 #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) ||  \
      defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \
     !defined(_WIN32)
-#if (!defined(__linux__) || defined(__GLIBC_MINOR__))
+// disable stacktrace for musl-libc
+#if !defined(__linux__) || defined(__GLIBC_MINOR__)
 #define KOKKOS_IMPL_ENABLE_STACKTRACE
 #endif
 #define KOKKOS_IMPL_ENABLE_CXXABI
diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
index 3e37eb61dc..a1d77071f5 100644
--- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -73,9 +73,8 @@ class ScratchMemorySpace {
       "Instantiating ScratchMemorySpace on non-execution-space type.");
 
  public:
-  // Alignment of memory chunks returned by 'get'
-  // must be a power of two
-  enum { ALIGN = 8 };
+  // Minimal overalignment used by view scratch allocations
+  constexpr static int ALIGN = 8;
 
  private:
   mutable char* m_iter_L0 = nullptr;
@@ -87,7 +86,9 @@ class ScratchMemorySpace {
   mutable int m_offset        = 0;
   mutable int m_default_level = 0;
 
-  enum { MASK = ALIGN - 1 };  // Alignment used by View::shmem_size
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  constexpr static int DEFAULT_ALIGNMENT_MASK = ALIGN - 1;
+#endif
 
  public:
   //! Tag this class as a memory space
@@ -101,39 +102,59 @@ class ScratchMemorySpace {
 
   static constexpr const char* name() { return "ScratchMemorySpace"; }
 
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
+  // This function is unused
   template <typename IntType>
-  KOKKOS_INLINE_FUNCTION static IntType align(const IntType& size) {
-    return (size + MASK) & ~MASK;
+  KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static constexpr IntType align(
+      const IntType& size) {
+    return (size + DEFAULT_ALIGNMENT_MASK) & ~DEFAULT_ALIGNMENT_MASK;
   }
+#endif
 
   template <typename IntType>
   KOKKOS_INLINE_FUNCTION void* get_shmem(const IntType& size,
                                          int level = -1) const {
-    return get_shmem_common</*aligned*/ false>(size, 1, level);
+    return get_shmem_common</*alignment_requested*/ false>(size, 1, level);
   }
 
   template <typename IntType>
   KOKKOS_INLINE_FUNCTION void* get_shmem_aligned(const IntType& size,
                                                  const ptrdiff_t alignment,
                                                  int level = -1) const {
-    return get_shmem_common</*aligned*/ true>(size, alignment, level);
+    return get_shmem_common</*alignment_requested*/ true>(size, alignment,
+                                                          level);
   }
 
  private:
-  template <bool aligned, typename IntType>
+  template <bool alignment_requested, typename IntType>
   KOKKOS_INLINE_FUNCTION void* get_shmem_common(const IntType& size,
                                                 const ptrdiff_t alignment,
                                                 int level = -1) const {
     if (level == -1) level = m_default_level;
-    auto& m_iter              = (level == 0) ? m_iter_L0 : m_iter_L1;
-    auto& m_end               = (level == 0) ? m_end_L0 : m_end_L1;
-    char* previous            = m_iter;
-    const ptrdiff_t missalign = size_t(m_iter) % alignment;
-    if (missalign) m_iter += alignment - missalign;
+    auto& m_iter = (level == 0) ? m_iter_L0 : m_iter_L1;
+    auto& m_end  = (level == 0) ? m_end_L0 : m_end_L1;
 
-    void* tmp = m_iter + m_offset * (aligned ? size : align(size));
-    if (m_end < (m_iter += (aligned ? size : align(size)) * m_multiplier)) {
-      m_iter = previous;  // put it back like it was
+    if (alignment_requested) {
+      const ptrdiff_t missalign = size_t(m_iter) % alignment;
+      if (missalign) m_iter += alignment - missalign;
+    }
+
+    // This is each thread's start pointer for its allocation
+    // Note: for team scratch m_offset is 0, since every
+    // thread will get back the same shared pointer
+    void* tmp           = m_iter + m_offset * size;
+    ptrdiff_t increment = size * m_multiplier;
+
+    // increment m_iter first and decrement it again if not
+    // enough memory was available. In the non-failing path
+    // this will save instructions.
+    m_iter += increment;
+
+    if (m_end < m_iter) {
+      // Request did overflow: reset the base team ptr, and
+      // return nullptr
+      m_iter -= increment;
+      tmp = nullptr;
 #ifdef KOKKOS_ENABLE_DEBUG
       // mfh 23 Jun 2015: printf call consumes 25 registers
       // in a CUDA build, so only print in debug mode.  The
@@ -143,7 +164,6 @@ class ScratchMemorySpace {
           "%ld byte(s); remaining capacity is %ld byte(s)\n",
           long(size), long(m_end - m_iter));
 #endif  // KOKKOS_ENABLE_DEBUG
-      tmp = nullptr;
     }
     return tmp;
   }
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
index ffdd1e9fc8..b0d1d693fc 100644
--- a/lib/kokkos/core/src/Kokkos_Serial.hpp
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -203,19 +203,11 @@ class Serial {
   static const char* name();
 
   Impl::SerialInternal* impl_internal_space_instance() const {
-#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
-    return m_space_instance;
-#else
     return m_space_instance.get();
-#endif
   }
 
  private:
-#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
-  Impl::SerialInternal* m_space_instance;
-#else
   Kokkos::Impl::HostSharedPtr<Impl::SerialInternal> m_space_instance;
-#endif
   //--------------------------------------------------------------------------
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp
index f8dcfc869e..795ef91e6a 100644
--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@@ -67,6 +67,8 @@ KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.")
 
 #include <impl/Kokkos_Tools.hpp>
 
+#include <Kokkos_MinMaxClamp.hpp>
+
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
 
@@ -1692,19 +1694,27 @@ class View : public ViewTraits<DataType, Properties...> {
         arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7));
   }
 
+ private:
+  // Want to be able to align to minimum scratch alignment or sizeof or alignof
+  // elements
+  static constexpr size_t scratch_value_alignment =
+      ::Kokkos::max(::Kokkos::max(sizeof(typename traits::value_type),
+                                  alignof(typename traits::value_type)),
+                    static_cast<size_t>(
+                        traits::execution_space::scratch_memory_space::ALIGN));
+
+ public:
   static KOKKOS_INLINE_FUNCTION size_t
   shmem_size(typename traits::array_layout const& arg_layout) {
-    return map_type::memory_span(arg_layout) +
-           sizeof(typename traits::value_type);
+    return map_type::memory_span(arg_layout) + scratch_value_alignment;
   }
 
   explicit KOKKOS_INLINE_FUNCTION View(
       const typename traits::execution_space::scratch_memory_space& arg_space,
       const typename traits::array_layout& arg_layout)
-      : View(Impl::ViewCtorProp<pointer_type>(
-                 reinterpret_cast<pointer_type>(arg_space.get_shmem_aligned(
-                     map_type::memory_span(arg_layout),
-                     sizeof(typename traits::value_type)))),
+      : View(Impl::ViewCtorProp<pointer_type>(reinterpret_cast<pointer_type>(
+                 arg_space.get_shmem_aligned(map_type::memory_span(arg_layout),
+                                             scratch_value_alignment))),
              arg_layout) {}
 
   explicit KOKKOS_INLINE_FUNCTION View(
@@ -1722,7 +1732,7 @@ class View : public ViewTraits<DataType, Properties...> {
                      map_type::memory_span(typename traits::array_layout(
                          arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6,
                          arg_N7)),
-                     sizeof(typename traits::value_type)))),
+                     scratch_value_alignment))),
              typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3,
                                            arg_N4, arg_N5, arg_N6, arg_N7),
              check_input_args::yes) {
diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp
index 9205e82560..25d220e45d 100644
--- a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp
+++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp
@@ -173,14 +173,8 @@ void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes,
 }  // namespace Impl
 
 Serial::Serial()
-#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS
-    : m_space_instance(&Impl::SerialInternal::singleton()) {
-}
-#else
     : m_space_instance(&Impl::SerialInternal::singleton(),
-                       [](Impl::SerialInternal*) {}) {
-}
-#endif
+                       [](Impl::SerialInternal*) {}) {}
 
 void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const {
   os << "Host Serial Execution Space:\n";
diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
index d726a86f76..c3e28c59f4 100644
--- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
+++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp
@@ -63,11 +63,10 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
 
   const FunctorType m_functor;
   const MDRangePolicy m_mdr_policy;
-  const Policy m_policy;
 
   void exec() const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+    const typename Policy::member_type e = m_mdr_policy.m_num_tiles;
+    for (typename Policy::member_type i = 0; i < e; ++i) {
       iterate_type(m_mdr_policy, m_functor)(i);
     }
   }
@@ -85,9 +84,7 @@ class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
   }
   inline ParallelFor(const FunctorType& arg_functor,
                      const MDRangePolicy& arg_policy)
-      : m_functor(arg_functor),
-        m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
+      : m_functor(arg_functor), m_mdr_policy(arg_policy) {}
 };
 
 template <class FunctorType, class ReducerType, class... Traits>
@@ -120,13 +117,12 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
 
   const FunctorType m_functor;
   const MDRangePolicy m_mdr_policy;
-  const Policy m_policy;
   const ReducerType m_reducer;
   const pointer_type m_result_ptr;
 
   inline void exec(reference_type update) const {
-    const typename Policy::member_type e = m_policy.end();
-    for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) {
+    const typename Policy::member_type e = m_mdr_policy.m_num_tiles;
+    for (typename Policy::member_type i = 0; i < e; ++i) {
       iterate_type(m_mdr_policy, m_functor, update)(i);
     }
   }
@@ -148,7 +144,8 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
     const size_t team_shared_size  = 0;  // Never shrinks
     const size_t thread_local_size = 0;  // Never shrinks
 
-    auto* internal_instance = m_policy.space().impl_internal_space_instance();
+    auto* internal_instance =
+        m_mdr_policy.space().impl_internal_space_instance();
     // Need to lock resize_thread_team_data
     std::lock_guard<std::mutex> lock(
         internal_instance->m_thread_team_data_mutex);
@@ -181,7 +178,6 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                                   void*> = nullptr)
       : m_functor(arg_functor),
         m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
         m_reducer(InvalidType()),
         m_result_ptr(arg_result_view.data()) {
     static_assert(Kokkos::is_view<HostViewType>::value,
@@ -197,7 +193,6 @@ class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
                         MDRangePolicy arg_policy, const ReducerType& reducer)
       : m_functor(arg_functor),
         m_mdr_policy(arg_policy),
-        m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
         m_reducer(reducer),
         m_result_ptr(reducer.view().data()) {
     /*static_assert( std::is_same< typename ViewType::memory_space
diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
index a5bd003237..fc89e485fd 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -1165,6 +1165,5 @@ void _kokkos_pgi_compiler_bug_workaround() {}
 #endif
 }  // namespace Kokkos
 
-Kokkos::Impl::InitializationSettingsHelper<std::string>::storage_type const
-    Kokkos::Impl::InitializationSettingsHelper<std::string>::unspecified =
-        "some string we don't expect user would ever provide";
+constexpr char
+    Kokkos::Impl::InitializationSettingsHelper<std::string>::unspecified[];
diff --git a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp
index ceb35f0247..00b2335fd7 100644
--- a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp
@@ -104,7 +104,9 @@ struct InitializationSettingsHelper<std::string> {
   using value_type   = std::string;
   using storage_type = std::string;
 
-  static storage_type const unspecified;
+  // prefer c-string to avoid static initialization order nightmare
+  static constexpr char unspecified[] =
+      "some string we don't expect user would ever provide";
 };
 }  // namespace Impl
 
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp
index 480b1a392b..796552cead 100644
--- a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp
@@ -655,9 +655,9 @@ void initialize(const std::string& profileLibrary) {
 
   char* envProfileLibrary = const_cast<char*>(profileLibrary.c_str());
 
-  const auto envProfileCopy =
-      std::make_unique<char[]>(strlen(envProfileLibrary) + 1);
-  sprintf(envProfileCopy.get(), "%s", envProfileLibrary);
+  const size_t envProfileLen = strlen(envProfileLibrary) + 1;
+  const auto envProfileCopy  = std::make_unique<char[]>(envProfileLen);
+  snprintf(envProfileCopy.get(), envProfileLen, "%s", envProfileLibrary);
 
   char* profileLibraryName = strtok(envProfileCopy.get(), ";");
 
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
index d915b7e472..55a29f1128 100644
--- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp
@@ -69,9 +69,10 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
   nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) +
           (do_other ? 4 : 0) + (do_tune ? 1 : 0);
 
-  char** args_kokkos = new char*[nargs];
+  char** args_kokkos      = new char*[nargs];
+  const int max_args_size = 45;
   for (int i = 0; i < nargs; i++) {
-    args_kokkos[i] = new char[45];
+    args_kokkos[i] = new char[max_args_size];
     delete_these.insert(args_kokkos[i]);
   }
 
@@ -112,7 +113,7 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
 #endif
 
     init_args.num_threads = nthreads;
-    sprintf(args_kokkos[threads_idx], "--threads=%i", nthreads);
+    snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads);
   }
 
   if (do_numa) {
@@ -130,24 +131,27 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device,
 #endif
 
     init_args.num_numa = numa;
-    sprintf(args_kokkos[numa_idx], "--numa=%i", numa);
+    snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa);
   }
 
   if (do_device) {
     init_args.device_id = 0;
-    sprintf(args_kokkos[device_idx], "--device-id=%i", 0);
+    snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0);
   }
 
   if (do_other) {
-    sprintf(args_kokkos[0], "--dummyarg=1");
-    sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], "--dummy2arg");
-    sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], "dummy3arg");
-    sprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], "dummy4arg=1");
+    snprintf(args_kokkos[0], max_args_size, "--dummyarg=1");
+    snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size,
+             "--dummy2arg");
+    snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size,
+             "dummy3arg");
+    snprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], max_args_size,
+             "dummy4arg=1");
   }
 
   if (do_tune) {
     init_args.tune_internals = true;
-    sprintf(args_kokkos[tune_idx], "--kokkos-tune-internals");
+    snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals");
   }
 
   return args_kokkos;
diff --git a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
index f66b35dc9f..18f3c9a777 100644
--- a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
+++ b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -91,7 +91,7 @@ void test_shared_alloc() {
     // Since always executed on host space, leave [=]
     Kokkos::parallel_for(range, [=](int i) {
       char name[64];
-      sprintf(name, "test_%.2d", i);
+      snprintf(name, 64, "test_%.2d", i);
 
       r[i] = RecordMemS::allocate(s, name, size * (i + 1));
       h[i] = Header::get_header(r[i]->data());
@@ -135,7 +135,7 @@ void test_shared_alloc() {
 
     Kokkos::parallel_for(range, [=](size_t i) {
       char name[64];
-      sprintf(name, "test_%.2d", int(i));
+      snprintf(name, 64, "test_%.2d", int(i));
 
       RecordFull* rec = RecordFull::allocate(s, name, size * (i + 1));
 
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
index 3f05b2ef66..58db4360e5 100644
--- a/lib/kokkos/core/unit_test/TestTeam.hpp
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -1551,14 +1551,16 @@ struct TestScratchAlignment {
     double x, y, z;
   };
   TestScratchAlignment() {
-    test(true);
-    test(false);
+    test_view(true);
+    test_view(false);
+    test_minimal();
+    test_raw();
   }
   using ScratchView =
       Kokkos::View<TestScalar *, typename ExecSpace::scratch_memory_space>;
   using ScratchViewInt =
       Kokkos::View<int *, typename ExecSpace::scratch_memory_space>;
-  void test(bool allocate_small) {
+  void test_view(bool allocate_small) {
     int shmem_size = ScratchView::shmem_size(11);
 #ifdef KOKKOS_ENABLE_OPENMPTARGET
     int team_size =
@@ -1580,12 +1582,68 @@ struct TestScratchAlignment {
         });
     Kokkos::fence();
   }
+
+  void test_minimal() {
+    using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
+    Kokkos::TeamPolicy<ExecSpace> policy(1, 1);
+    size_t scratch_size = sizeof(int);
+    Kokkos::View<int, ExecSpace> flag("Flag");
+
+    Kokkos::parallel_for(
+        policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
+        KOKKOS_LAMBDA(const member_type &team) {
+          int *scratch_ptr = (int *)team.team_shmem().get_shmem(scratch_size);
+          if (scratch_ptr == nullptr) flag() = 1;
+        });
+    Kokkos::fence();
+    int minimal_scratch_allocation_failed = 0;
+    Kokkos::deep_copy(minimal_scratch_allocation_failed, flag);
+    ASSERT_TRUE(minimal_scratch_allocation_failed == 0);
+  }
+
+  void test_raw() {
+    using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
+    Kokkos::TeamPolicy<ExecSpace> policy(1, 1);
+    Kokkos::View<int, ExecSpace> flag("Flag");
+
+    Kokkos::parallel_for(
+        policy.set_scratch_size(0, Kokkos::PerTeam(1024)),
+        KOKKOS_LAMBDA(const member_type &team) {
+          int *scratch_ptr1 = (int *)team.team_shmem().get_shmem(24);
+          int *scratch_ptr2 = (int *)team.team_shmem().get_shmem(32);
+          int *scratch_ptr3 = (int *)team.team_shmem().get_shmem(12);
+
+          if ((int(scratch_ptr2 - scratch_ptr1) != 6) ||
+              (int(scratch_ptr3 - scratch_ptr2) != 8))
+            flag() = 1;
+
+          if (((scratch_ptr3 - static_cast<int *>(nullptr)) + 3) % 2 == 1)
+            scratch_ptr1 = (int *)team.team_shmem().get_shmem_aligned(24, 4);
+          else {
+            scratch_ptr1 = (int *)team.team_shmem().get_shmem_aligned(12, 4);
+          }
+          scratch_ptr2 = (int *)team.team_shmem().get_shmem_aligned(32, 8);
+          scratch_ptr3 = (int *)team.team_shmem().get_shmem_aligned(8, 4);
+
+          if ((int(scratch_ptr2 - scratch_ptr1) != 7) &&
+              (int(scratch_ptr2 - scratch_ptr1) != 4))
+            flag() = 1;
+          if (int(scratch_ptr3 - scratch_ptr2) != 8) flag() = 1;
+          if ((int(size_t(scratch_ptr1) % 4) != 0) ||
+              (int(size_t(scratch_ptr2) % 8) != 0) ||
+              (int(size_t(scratch_ptr3) % 4) != 0))
+            flag() = 1;
+        });
+    Kokkos::fence();
+    int raw_get_shmem_alignment_failed = 0;
+    Kokkos::deep_copy(raw_get_shmem_alignment_failed, flag);
+    ASSERT_TRUE(raw_get_shmem_alignment_failed == 0);
+  }
 };
 
 }  // namespace
 
 namespace {
-
 template <class ExecSpace>
 struct TestTeamPolicyHandleByValue {
   using scalar     = double;
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
index 31fd63f084..c4381b29e7 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp
@@ -59,9 +59,11 @@ __global__ void offset(int* p) {
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
 // Cuda.
 TEST(cuda, raw_cuda_interop) {
+  // Make sure that we use the same device for all allocations
+  Kokkos::initialize();
+
   int* p;
   KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100));
-  Kokkos::initialize();
 
   Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, 100);
   Kokkos::deep_copy(v, 5);
diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
index f11f657e00..69aef6f8d8 100644
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp
@@ -48,9 +48,11 @@
 namespace Test {
 // Test Interoperability with Cuda Streams
 TEST(cuda, raw_cuda_streams) {
+  // Make sure that we use the same device for all allocations
+  Kokkos::initialize();
+
   cudaStream_t stream;
   cudaStreamCreate(&stream);
-  Kokkos::initialize();
   int* p;
   cudaMalloc(&p, sizeof(int) * 100);
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
index af20e753d4..ce76076f9b 100644
--- a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp
@@ -59,9 +59,11 @@ __global__ void offset(int* p) {
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
 // HIP.
 TEST(hip, raw_hip_interop) {
+  // Make sure that we use the same device for all allocations
+  Kokkos::initialize();
+
   int* p;
   KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
-  Kokkos::initialize();
 
   Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Unmanaged>> v(p, 100);
   Kokkos::deep_copy(v, 5);
diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
index 95d102d4d1..6c7fdd7044 100644
--- a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
+++ b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp
@@ -50,9 +50,11 @@ namespace Test {
 // The difference with the CUDA tests are: raw HIP vs raw CUDA and no launch
 // bound in HIP due to an error when computing the block size.
 TEST(hip, raw_hip_streams) {
+  // Make sure that we use the same device for all allocations
+  Kokkos::initialize();
+
   hipStream_t stream;
   KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream));
-  Kokkos::initialize();
   int* p;
   KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100));
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
index e45d990745..1189aba26f 100644
--- a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
+++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp
@@ -52,8 +52,8 @@ namespace Test {
 // Test whether allocations survive Kokkos initialize/finalize if done via Raw
 // SYCL.
 TEST(sycl, raw_sycl_interop) {
+  // Make sure all queues use the same context
   Kokkos::initialize();
-
   Kokkos::Experimental::SYCL default_space;
   sycl::context default_context = default_space.sycl_queue().get_context();
 
diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
index 114d2a4aa2..7b9a664304 100644
--- a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
+++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp
@@ -51,6 +51,7 @@ namespace Test {
 
 // Test whether external allocations can be accessed by the default queue.
 TEST(sycl, raw_sycl_interop_context_1) {
+  // Make sure all queues use the same context
   Kokkos::Experimental::SYCL default_space;
   sycl::context default_context = default_space.sycl_queue().get_context();
 
diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
index 8ffada1dab..be093d8edc 100644
--- a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
+++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp
@@ -48,9 +48,13 @@
 namespace Test {
 // Test Interoperability with SYCL Streams
 TEST(sycl, raw_sycl_queues) {
-  sycl::default_selector device_selector;
-  sycl::queue queue(device_selector);
+  // Make sure all queues use the same context
   Kokkos::initialize();
+  Kokkos::Experimental::SYCL default_space;
+  sycl::context default_context = default_space.sycl_queue().get_context();
+
+  sycl::default_selector device_selector;
+  sycl::queue queue(default_context, device_selector);
   int* p            = sycl::malloc_device<int>(100, queue);
   using MemorySpace = typename TEST_EXECSPACE::memory_space;
 
diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
index 1815adb4a7..b55be52264 100644
--- a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
+++ b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp
@@ -76,7 +76,7 @@ namespace Impl {
 /// instances in other translation units, we must update this CUDA global
 /// variable based on the Host global variable prior to running any kernels
 /// that will use it.
-/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
+/// That is the purpose of the ensure_cuda_lock_arrays_on_device function.
 __device__
 #ifdef __CUDACC_RDC__
     __constant__ extern
@@ -138,33 +138,42 @@ namespace {
 static int lock_array_copied = 0;
 inline int eliminate_warning_for_lock_array() { return lock_array_copied; }
 }  // namespace
+
+#ifdef __CUDACC_RDC__
+inline
+#else
+inline static
+#endif
+    void
+    copy_cuda_lock_arrays_to_device() {
+  if (lock_array_copied == 0) {
+    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE,
+                       &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h,
+                       sizeof(int32_t*));
+    cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE,
+                       &CUDA_SPACE_ATOMIC_LOCKS_NODE_h,
+                       sizeof(int32_t*));
+  }
+  lock_array_copied = 1;
+}
+
 }  // namespace Impl
 }  // namespace desul
-/* It is critical that this code be a macro, so that it will
-   capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE
-   putting this in an inline function will NOT do the right thing! */
-#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()                       \
-  {                                                                        \
-    if (::desul::Impl::lock_array_copied == 0) {                           \
-      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE,    \
-                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \
-                         sizeof(int32_t*));                                \
-      cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE,      \
-                         &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h,   \
-                         sizeof(int32_t*));                                \
-    }                                                                      \
-    ::desul::Impl::lock_array_copied = 1;                                  \
-  }
 
 #endif /* defined( __CUDACC__ ) */
 
 #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */
 
+namespace desul {
+
 #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__))
-#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
+inline void ensure_cuda_lock_arrays_on_device() {}
 #else
-#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \
-  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
+static inline void ensure_cuda_lock_arrays_on_device() {
+  Impl::copy_cuda_lock_arrays_to_device();
+}
 #endif
 
-#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */
+}  // namespace desul
+
+#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */
diff --git a/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp b/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
index cb8482c5da..19944b378e 100644
--- a/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
+++ b/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp
@@ -70,7 +70,7 @@ void init_lock_arrays_cuda() {
                              "init_lock_arrays_cuda: cudaMalloc host locks");
 
   auto error_sync1 = cudaDeviceSynchronize();
-  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  copy_cuda_lock_arrays_to_device();
   check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs");
   init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>();
   auto error_sync2 = cudaDeviceSynchronize();
@@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() {
   CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr;
   CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr;
 #ifdef __CUDACC_RDC__
-  DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
+  copy_cuda_lock_arrays_to_device();
 #endif
 }