Update Kokkos library in LAMMPS to v4.4.1

2024-09-13 12:14:49 -06:00
parent 3079d51eaf
commit 487f7ade68
23 changed files with 470 additions and 80 deletions
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@ -1,5 +1,20 @@
 # CHANGELOG

+## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01)
+
+### Features:
+* Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229)
+
+### Backend and Architecture Enhancements:
+
+#### CUDA:
+* Experimental support for unified memory mode (intended for Grace-Hopper etc.) [\#6823](https://github.com/kokkos/kokkos/pull/6823)
+
+### Bug Fixes
+* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284)
+* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296)
+
 ## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00)

--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@ -151,7 +151,7 @@ ENDIF()

 set(Kokkos_VERSION_MAJOR 4)
 set(Kokkos_VERSION_MINOR 4)
-set(Kokkos_VERSION_PATCH 0)
+set(Kokkos_VERSION_PATCH 1)
 set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
 message(STATUS "Kokkos version: ${Kokkos_VERSION}")
 math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@ -12,7 +12,7 @@ endif

 KOKKOS_VERSION_MAJOR = 4
 KOKKOS_VERSION_MINOR = 4
-KOKKOS_VERSION_PATCH = 0
+KOKKOS_VERSION_PATCH = 1
 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

 # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
--- a/lib/kokkos/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/cmake/KokkosCore_config.h.in
@ -37,6 +37,7 @@
 #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA  // deprecated
 #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
 #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
+#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
 #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
 #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
 #cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY
--- a/lib/kokkos/cmake/kokkos_enable_options.cmake
+++ b/lib/kokkos/cmake/kokkos_enable_options.cmake
@ -48,6 +48,8 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
 # resolved but we keep the option around a bit longer to be safe.
 KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON  "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
 KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
+KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")
+
 KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4    ON "Whether code deprecated in major release 4 is available" )
 KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
 KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE  OFF "Whether to enable relocatable device code (RDC) for HIP")
@ -135,7 +137,7 @@ FUNCTION(check_device_specific_options)
  ENDIF()
 ENDFUNCTION()

-CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC)
+CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY)
 CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE)
 CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH)

--- a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
+++ b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
@ -37,6 +37,17 @@
 #endif
 ///@}

+/// Some tests are skipped for unified memory space
+#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE                               \
+  if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space,    \
+                               Kokkos::CudaSpace>)                       \
+    GTEST_SKIP() << "skipping since unified memory requires additional " \
+                    "fences";
+#else
+#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
+#endif
+
 TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
  using namespace Kokkos::Test::Tools;
  listen_tool_events(Config::DisableAll(), Config::EnableKernels());
@ -657,6 +668,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {

 TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
  GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
+  GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE

  using namespace Kokkos::Test::Tools;
  listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@ -31,7 +31,6 @@
 #include <algorithm>
 #include <atomic>

-//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
 #include <impl/Kokkos_Error.hpp>

 #include <impl/Kokkos_Tools.hpp>
@ -178,6 +177,29 @@ void *impl_allocate_common(const int device_id,
  cudaError_t error_code = cudaSuccess;
 #ifndef CUDART_VERSION
 #error CUDART_VERSION undefined!
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  // This is intended for Grace-Hopper (and future unified memory architectures)
+  // The idea is to use host allocator and then advise to keep it in HBM on the
+  // device, but that requires CUDA 12.2
+  static_assert(CUDART_VERSION >= 12020,
+                "CUDA runtime version >=12.2 required when "
+                "Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. "
+                "Please update your CUDA runtime version or "
+                "reconfigure with "
+                "-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF");
+  if (arg_alloc_size) {  // cudaMemAdvise_v2 does not work with nullptr
+    error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
+    if (error_code == cudaSuccess) {
+      // One would think cudaMemLocation{device_id,
+      // cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of
+      // members doesn't seem to be defined.
+      cudaMemLocation loc;
+      loc.id   = device_id;
+      loc.type = cudaMemLocationTypeDevice;
+      KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
+          ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
+    }
+  }
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
  if (arg_alloc_size >= memory_threshold_g) {
    error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
@ -190,9 +212,13 @@ void *impl_allocate_common(const int device_id,
            "Kokkos::Cuda: backend fence after async malloc");
      }
    }
-  } else
+  } else {
+    error_code = cudaMalloc(&ptr, arg_alloc_size);
+  }
+#else
+  error_code = cudaMalloc(&ptr, arg_alloc_size);
 #endif
-  { error_code = cudaMalloc(&ptr, arg_alloc_size); }
+
  if (error_code != cudaSuccess) {  // TODO tag as unlikely branch
    // This is the only way to clear the last error, which
    // we should do here since we're turning it into an
@ -326,6 +352,9 @@ void CudaSpace::impl_deallocate(
  }
 #ifndef CUDART_VERSION
 #error CUDART_VERSION undefined!
+#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
+  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
 #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
  if (arg_alloc_size >= memory_threshold_g) {
    Impl::cuda_device_synchronize(
@ -436,8 +465,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,

 #include <impl/Kokkos_SharedAlloc_timpl.hpp>

+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
 KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
    Kokkos::CudaSpace);
+#else
+KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
+#endif
 KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
    Kokkos::CudaUVMSpace);
 KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp
@ -88,6 +88,19 @@ class CudaSpace {
  void* allocate(const char* arg_label, const size_t arg_alloc_size,
                 const size_t arg_logical_size = 0) const;

+#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  template <typename ExecutionSpace>
+  void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
+    return allocate(arg_alloc_size);
+  }
+  template <typename ExecutionSpace>
+  void* allocate(const ExecutionSpace&, const char* arg_label,
+                 const size_t arg_alloc_size,
+                 const size_t arg_logical_size = 0) const {
+    return allocate(arg_label, arg_alloc_size, arg_logical_size);
+  }
+#endif
+
  /**\brief  Deallocate untracked memory in the cuda space */
  void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
  void deallocate(const char* arg_label, void* const arg_alloc_ptr,
@ -337,7 +350,11 @@ static_assert(
 template <>
 struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
  enum : bool { assignable = false };
-  enum : bool { accessible = false };
+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
+  enum : bool{accessible = false};
+#else
+  enum : bool { accessible = true };
+#endif
  enum : bool { deepcopy = true };
 };

@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------

+#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
 KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
    Kokkos::CudaSpace);
+#else
+KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
+#endif
 KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
 KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);

--- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
@ -607,6 +607,22 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default

  //----------------------------------

+#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+  // Check if unified memory is available
+  int cuda_result;
+  cudaDeviceGetAttribute(&cuda_result, cudaDevAttrConcurrentManagedAccess,
+                         cuda_device_id);
+  if (cuda_result == 0) {
+    Kokkos::abort(
+        "Kokkos::Cuda::initialize ERROR: Unified memory is not available on "
+        "this device\n"
+        "Please recompile Kokkos with "
+        "-DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF\n");
+  }
+#endif
+
+  //----------------------------------
+
  cudaStream_t singleton_stream;
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
  KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
@ -705,6 +721,10 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
 #else
  os << "no\n";
 #endif
+#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
+  os << "  KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: ";
+  os << "yes\n";
+#endif

  os << "\nCuda Runtime Configuration:\n";

--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@ -571,6 +571,8 @@ inline constexpr Kokkos::ALL_t ALL{};
 #pragma omp end declare target
 #endif

+inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{};
+
 inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{};

 inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{};
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp
@ -113,7 +113,7 @@ int OpenMP::impl_thread_pool_size() const noexcept {
 }

 int OpenMP::impl_max_hardware_threads() noexcept {
-  return Impl::g_openmp_hardware_max_threads;
+  return Impl::OpenMPInternal::max_hardware_threads();
 }

 namespace Impl {
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
@ -31,12 +31,20 @@
 #include <sstream>
 #include <thread>

+namespace {
+int g_openmp_hardware_max_threads = 1;
+}
+
 namespace Kokkos {
 namespace Impl {

 std::vector<OpenMPInternal *> OpenMPInternal::all_instances;
 std::mutex OpenMPInternal::all_instances_mutex;

+int OpenMPInternal::max_hardware_threads() noexcept {
+  return g_openmp_hardware_max_threads;
+}
+
 void OpenMPInternal::clear_thread_data() {
  const size_t member_bytes =
      sizeof(int64_t) *
@ -188,9 +196,9 @@ void OpenMPInternal::initialize(int thread_count) {
    // Before any other call to OMP query the maximum number of threads
    // and save the value for re-initialization unit testing.

-    Impl::g_openmp_hardware_max_threads = get_current_max_threads();
+    g_openmp_hardware_max_threads = get_current_max_threads();

-    int process_num_threads = Impl::g_openmp_hardware_max_threads;
+    int process_num_threads = g_openmp_hardware_max_threads;

    if (Kokkos::hwloc::available()) {
      process_num_threads = Kokkos::hwloc::get_available_numa_count() *
@ -203,11 +211,11 @@ void OpenMPInternal::initialize(int thread_count) {
    // process_num_threads if thread_count  > 0, set
    // g_openmp_hardware_max_threads to thread_count
    if (thread_count < 0) {
-      thread_count = Impl::g_openmp_hardware_max_threads;
+      thread_count = g_openmp_hardware_max_threads;
    } else if (thread_count == 0) {
-      if (Impl::g_openmp_hardware_max_threads != process_num_threads) {
-        Impl::g_openmp_hardware_max_threads = process_num_threads;
-        omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
+      if (g_openmp_hardware_max_threads != process_num_threads) {
+        g_openmp_hardware_max_threads = process_num_threads;
+        omp_set_num_threads(g_openmp_hardware_max_threads);
      }
    } else {
      if (Kokkos::show_warnings() && thread_count > process_num_threads) {
@ -218,16 +226,16 @@ void OpenMPInternal::initialize(int thread_count) {
                  << ",  requested thread : " << std::setw(3) << thread_count
                  << std::endl;
      }
-      Impl::g_openmp_hardware_max_threads = thread_count;
-      omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
+      g_openmp_hardware_max_threads = thread_count;
+      omp_set_num_threads(g_openmp_hardware_max_threads);
    }

 // setup thread local
-#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
+#pragma omp parallel num_threads(g_openmp_hardware_max_threads)
    { Impl::SharedAllocationRecord<void, void>::tracking_enable(); }

    auto &instance       = OpenMPInternal::singleton();
-    instance.m_pool_size = Impl::g_openmp_hardware_max_threads;
+    instance.m_pool_size = g_openmp_hardware_max_threads;

    // New, unified host thread team data:
    {
@ -272,10 +280,9 @@ void OpenMPInternal::finalize() {
  if (this == &singleton()) {
    auto const &instance = singleton();
    // Silence Cuda Warning
-    const int nthreads =
-        instance.m_pool_size <= Impl::g_openmp_hardware_max_threads
-            ? Impl::g_openmp_hardware_max_threads
-            : instance.m_pool_size;
+    const int nthreads = instance.m_pool_size <= g_openmp_hardware_max_threads
+                             ? g_openmp_hardware_max_threads
+                             : instance.m_pool_size;
    (void)nthreads;

 #pragma omp parallel num_threads(nthreads)
@ -284,7 +291,7 @@ void OpenMPInternal::finalize() {
    // allow main thread to track
    Impl::SharedAllocationRecord<void, void>::tracking_enable();

-    Impl::g_openmp_hardware_max_threads = 1;
+    g_openmp_hardware_max_threads = 1;
  }

  m_initialized = false;
@ -307,7 +314,7 @@ void OpenMPInternal::print_configuration(std::ostream &s) const {

  if (m_initialized) {
    const int numa_count      = 1;
-    const int core_per_numa   = Impl::g_openmp_hardware_max_threads;
+    const int core_per_numa   = g_openmp_hardware_max_threads;
    const int thread_per_core = 1;

    s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp
@ -47,8 +47,6 @@ namespace Impl {

 class OpenMPInternal;

-inline int g_openmp_hardware_max_threads = 1;
-
 struct OpenMPTraits {
  static constexpr int MAX_THREAD_COUNT = 512;
 };
@ -86,6 +84,8 @@ class OpenMPInternal {

  void clear_thread_data();

+  static int max_hardware_threads() noexcept;
+
  int thread_pool_size() const { return m_pool_size; }

  void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes,
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp
@ -105,7 +105,8 @@ class UniqueToken<OpenMP, UniqueTokenScope::Global> {
  /// \brief upper bound for acquired values, i.e. 0 <= value < size()
  KOKKOS_INLINE_FUNCTION
  int size() const noexcept {
-    KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;))
+    KOKKOS_IF_ON_HOST(
+        (return Kokkos::Impl::OpenMPInternal::max_hardware_threads();))

    KOKKOS_IF_ON_DEVICE((return 0;))
  }
--- a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp
+++ b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp
@ -313,6 +313,51 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {

  void destroy_shared_allocation() {}
 };
+
+template <class DeviceType, class ValueType>
+struct ViewValueFunctorSequentialHostInit {
+  using ExecSpace = typename DeviceType::execution_space;
+  using MemSpace  = typename DeviceType::memory_space;
+  static_assert(SpaceAccessibility<HostSpace, MemSpace>::accessible);
+
+  ValueType* ptr;
+  size_t n;
+
+  ViewValueFunctorSequentialHostInit() = default;
+
+  ViewValueFunctorSequentialHostInit(ExecSpace const& /*arg_space*/,
+                                     ValueType* const arg_ptr,
+                                     size_t const arg_n,
+                                     std::string /*arg_name*/)
+      : ptr(arg_ptr), n(arg_n) {}
+
+  ViewValueFunctorSequentialHostInit(ValueType* const arg_ptr,
+                                     size_t const arg_n,
+                                     std::string /*arg_name*/)
+      : ptr(arg_ptr), n(arg_n) {}
+
+  void construct_shared_allocation() {
+    if constexpr (std::is_trivial_v<ValueType>) {
+      // value-initialization is equivalent to filling with zeros
+      std::memset(static_cast<void*>(ptr), 0, n * sizeof(ValueType));
+    } else {
+      for (size_t i = 0; i < n; ++i) {
+        new (ptr + i) ValueType();
+      }
+    }
+  }
+
+  void destroy_shared_allocation() {
+    if constexpr (std::is_trivially_destructible_v<ValueType>) {
+      // do nothing, don't bother calling the destructor
+    } else {
+      for (size_t i = 0; i < n; ++i) {
+        (ptr + i)->~ValueType();
+      }
+    }
+  }
+};
+
 }  // namespace Kokkos::Impl

 #endif  // KOKKOS_VIEW_ALLOC_HPP
--- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp
@ -23,12 +23,16 @@
 namespace Kokkos {
 namespace Impl {

+struct SequentialHostInit_t {};
 struct WithoutInitializing_t {};
 struct AllowPadding_t {};

 template <typename>
 struct is_view_ctor_property : public std::false_type {};

+template <>
+struct is_view_ctor_property<SequentialHostInit_t> : public std::true_type {};
+
 template <>
 struct is_view_ctor_property<WithoutInitializing_t> : public std::true_type {};

@ -84,10 +88,10 @@ struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T>> {

 /* Property flags have constexpr value */
 template <typename P>
-struct ViewCtorProp<
-    std::enable_if_t<std::is_same<P, AllowPadding_t>::value ||
-                     std::is_same<P, WithoutInitializing_t>::value>,
-    P> {
+struct ViewCtorProp<std::enable_if_t<std::is_same_v<P, AllowPadding_t> ||
+                                     std::is_same_v<P, WithoutInitializing_t> ||
+                                     std::is_same_v<P, SequentialHostInit_t>>,
+                    P> {
  ViewCtorProp()                     = default;
  ViewCtorProp(const ViewCtorProp &) = default;
  ViewCtorProp &operator=(const ViewCtorProp &) = default;
@ -199,6 +203,11 @@ struct ViewCtorProp : public ViewCtorProp<void, P>... {
      Kokkos::Impl::has_type<AllowPadding_t, P...>::value;
  static constexpr bool initialize =
      !Kokkos::Impl::has_type<WithoutInitializing_t, P...>::value;
+  static constexpr bool sequential_host_init =
+      Kokkos::Impl::has_type<SequentialHostInit_t, P...>::value;
+  static_assert(initialize || !sequential_host_init,
+                "Incompatible WithoutInitializing and SequentialHostInit view "
+                "alloc properties");

  using memory_space    = typename var_memory_space::type;
  using execution_space = typename var_execution_space::type;
@ -251,7 +260,9 @@ auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop,
                (is_view_label<Property>::value &&
                 !ViewCtorProp<P...>::has_label) ||
                (std::is_same_v<Property, WithoutInitializing_t> &&
-                 ViewCtorProp<P...>::initialize)) {
+                 ViewCtorProp<P...>::initialize) ||
+                (std::is_same_v<Property, SequentialHostInit_t> &&
+                 !ViewCtorProp<P...>::sequential_host_init)) {
    using NewViewCtorProp = ViewCtorProp<P..., Property>;
    NewViewCtorProp new_view_ctor_prop(view_ctor_prop);
    static_cast<ViewCtorProp<void, Property> &>(new_view_ctor_prop).value =
@ -299,7 +310,9 @@ struct WithPropertiesIfUnset<ViewCtorProp<P...>, Property, Properties...> {
                  (is_view_label<Property>::value &&
                   !ViewCtorProp<P...>::has_label) ||
                  (std::is_same_v<Property, WithoutInitializing_t> &&
-                   ViewCtorProp<P...>::initialize)) {
+                   ViewCtorProp<P...>::initialize) ||
+                  (std::is_same_v<Property, SequentialHostInit_t> &&
+                   !ViewCtorProp<P...>::sequential_host_init)) {
      using NewViewCtorProp = ViewCtorProp<P..., Property>;
      NewViewCtorProp new_view_ctor_prop(view_ctor_prop);
      static_cast<ViewCtorProp<void, Property> &>(new_view_ctor_prop).value =
--- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp
@ -2825,10 +2825,12 @@ class ViewMapping<
    using memory_space    = typename Traits::memory_space;
    static_assert(
        SpaceAccessibility<execution_space, memory_space>::accessible);
-    using value_type = typename Traits::value_type;
-    using functor_type =
-        ViewValueFunctor<Kokkos::Device<execution_space, memory_space>,
-                         value_type>;
+    using device_type  = Kokkos::Device<execution_space, memory_space>;
+    using value_type   = typename Traits::value_type;
+    using functor_type = std::conditional_t<
+        alloc_prop::sequential_host_init,
+        ViewValueFunctorSequentialHostInit<device_type, value_type>,
+        ViewValueFunctor<device_type, value_type>>;
    using record_type =
        Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>;

--- a/lib/kokkos/core/unit_test/TestViewOfViews.hpp
+++ b/lib/kokkos/core/unit_test/TestViewOfViews.hpp
@ -20,7 +20,7 @@

 namespace {

-// User-defined type with a View data member
+// User-defined types with a View data member
 template <class V>
 class S {
  V v_;
@ -28,48 +28,102 @@ class S {
 public:
  template <class... Extents>
  S(std::string label, Extents... extents) : v_(std::move(label), extents...) {}
-  S() = default;
+  KOKKOS_DEFAULTED_FUNCTION S() = default;
 };

 template <class V>
-void test_view_of_views() {
+class N {  // not default constructible
+  V v_;
+
+ public:
+  template <class... Extents>
+  N(std::string label, Extents... extents) : v_(std::move(label), extents...) {}
+};
+
+template <class V>
+class H {  // constructible and destructible only from on the host side
+  V v_;
+
+ public:
+  template <class... Extents>
+  H(std::string label, Extents... extents) : v_(std::move(label), extents...) {}
+  H() {}
+  ~H() {}
+};
+
+template <class V>
+void test_view_of_views_default() {
+  // assigning a default-constructed view to destruct the inner objects
  using VoV = Kokkos::View<V**, Kokkos::HostSpace>;
-  {  // assigning a default-constructed view to destruct the inner objects
-    VoV vov("vov", 2, 3);
-    V a("a");
-    V b("b");
-    vov(0, 0) = a;
-    vov(1, 0) = a;
-    vov(0, 1) = b;
+  VoV vov("vov", 2, 3);
+  V a("a");
+  V b("b");
+  vov(0, 0) = a;
+  vov(1, 0) = a;
+  vov(0, 1) = b;
 #ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND
-    vov(0, 0) = V();
-    vov(1, 0) = V();
-    vov(0, 1) = V();
+  vov(0, 0) = V();
+  vov(1, 0) = V();
+  vov(0, 1) = V();
 #endif
-  }
-  {  // using placement new to construct the inner objects and explicitly
-     // calling the destructor
-    VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3);
-    V a("a");
-    V b("b");
-    new (&vov(0, 0)) V(a);
-    new (&vov(1, 0)) V(a);
-    new (&vov(0, 1)) V(b);
-#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND
-    vov(0, 0).~V();
-    vov(1, 0).~V();
-    vov(0, 1).~V();
-#else
-    // leaks memory
-#endif
-  }
 }

-TEST(TEST_CATEGORY, view_of_views) {
-  test_view_of_views<Kokkos::View<int, TEST_EXECSPACE>>();
-  test_view_of_views<Kokkos::View<int[4], TEST_EXECSPACE>>();
+template <class V>
+void test_view_of_views_without_initializing() {
+  // using placement new to construct the inner objects and explicitly
+  // calling the destructor
+  using VoV = Kokkos::View<V**, Kokkos::HostSpace>;
+  VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3);
+  V a("a");
+  V b("b");
+  new (&vov(0, 0)) V(a);
+  new (&vov(1, 0)) V(a);
+  new (&vov(0, 1)) V(b);
+#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND
+  vov(0, 0).~V();
+  vov(1, 0).~V();
+  vov(0, 1).~V();
+#else
+  // leaks memory
+#endif
+}
+
+template <class V>
+void test_view_of_views_sequential_host_init() {
+  // inner views value-initialized sequentially on the host, and also
+  // sequentially destructed on the host, without the need to cleanup
+  using VoV = Kokkos::View<V**, Kokkos::HostSpace>;
+  VoV vov(Kokkos::view_alloc("vov", Kokkos::SequentialHostInit), 2, 3);
+  V a("a");
+  V b("b");
+  vov(0, 0) = a;
+  vov(1, 0) = a;
+  vov(0, 1) = b;
+}
+
+TEST(TEST_CATEGORY, view_of_views_default) {
+  test_view_of_views_default<Kokkos::View<int, TEST_EXECSPACE>>();
+  test_view_of_views_default<Kokkos::View<int[4], TEST_EXECSPACE>>();
  // User-defined type with View data member
-  test_view_of_views<S<Kokkos::View<float, TEST_EXECSPACE>>>();
+  test_view_of_views_default<S<Kokkos::View<float, TEST_EXECSPACE>>>();
+}
+
+TEST(TEST_CATEGORY, view_of_views_without_initializing) {
+  test_view_of_views_without_initializing<Kokkos::View<int, TEST_EXECSPACE>>();
+  test_view_of_views_without_initializing<
+      S<Kokkos::View<float, TEST_EXECSPACE>>>();
+  test_view_of_views_without_initializing<
+      N<Kokkos::View<double, TEST_EXECSPACE>>>();
+  test_view_of_views_without_initializing<
+      H<Kokkos::View<int, TEST_EXECSPACE>>>();
+}
+
+TEST(TEST_CATEGORY, test_view_of_views_sequential_host_init) {
+  test_view_of_views_sequential_host_init<Kokkos::View<int, TEST_EXECSPACE>>();
+  test_view_of_views_sequential_host_init<
+      S<Kokkos::View<float, TEST_EXECSPACE>>>();
+  test_view_of_views_sequential_host_init<
+      H<Kokkos::View<int, TEST_EXECSPACE>>>();
 }

 }  // namespace
--- a/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
+++ b/lib/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp
@ -39,9 +39,14 @@ TEST(cuda, space_access) {
      !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
                                       Kokkos::CudaSpace>::assignable);

+#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
  static_assert(
      !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
                                       Kokkos::CudaSpace>::accessible);
+#else
+  static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
+                                                Kokkos::CudaSpace>::accessible);
+#endif

  static_assert(
      !Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
@ -147,8 +152,13 @@ TEST(cuda, space_access) {
      Kokkos::SpaceAccessibility<Kokkos::Cuda,
                                 Kokkos::CudaHostPinnedSpace>::accessible);

+#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
  static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                            Kokkos::CudaSpace>::accessible);
+#else
+  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
+                                           Kokkos::CudaSpace>::accessible);
+#endif

  static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                           Kokkos::CudaUVMSpace>::accessible);
@ -157,8 +167,14 @@ TEST(cuda, space_access) {
      Kokkos::SpaceAccessibility<Kokkos::HostSpace,
                                 Kokkos::CudaHostPinnedSpace>::accessible);

+#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
  static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
                             Kokkos::HostSpace>::value);
+#else
+  static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
+                             Kokkos::Device<Kokkos::HostSpace::execution_space,
+                                            Kokkos::CudaSpace>>::value);
+#endif

  static_assert(
      std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space,
--- a/lib/kokkos/master_history.txt
+++ b/lib/kokkos/master_history.txt
@ -38,3 +38,4 @@ tag:  4.2.01     date: 01:30:2024    master: 71a9bcae    release: 221e5f7a
 tag:  4.3.00     date: 04:03:2024    master: e0dc0128    release: f08217a4
 tag:  4.3.01     date: 05:07:2024    master: 486cc745    release: 262d2d6e
 tag:  4.4.00     date: 08:08:2024    master: 6ecdf605    release: 6068673c
+tag:  4.4.01     date: 09:12:2024    master: 08ceff92    release: 2d60c039
--- a/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp
+++ b/lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp
@ -361,9 +361,7 @@ class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> {
  };
  using value_type = bool;
  using abi_type   = simd_abi::avx2_fixed_size<4>;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask()                 = default;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
      : m_value(_mm_set1_epi32(-std::int32_t(value))) {}
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
@ -460,9 +458,7 @@ class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>> {
  };
  using value_type = bool;
  using abi_type   = simd_abi::avx2_fixed_size<8>;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask()                 = default;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
      : m_value(_mm256_set1_epi32(-std::int32_t(value))) {}
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
@ -561,9 +557,7 @@ class simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> {
  };
  using value_type = bool;
  using abi_type   = simd_abi::avx2_fixed_size<4>;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask()                 = default;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default;
-  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&)      = default;
+  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
      : m_value(_mm256_set1_epi64x(-std::int64_t(value))) {}
  KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
--- a/lib/kokkos/simd/unit_tests/TestSIMD.cpp
+++ b/lib/kokkos/simd/unit_tests/TestSIMD.cpp
@ -22,3 +22,4 @@
 #include <TestSIMD_GeneratorCtors.hpp>
 #include <TestSIMD_WhereExpressions.hpp>
 #include <TestSIMD_Reductions.hpp>
+#include <TestSIMD_Construction.hpp>
--- a/lib/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp
+++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp
@ -0,0 +1,150 @@
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 4.0
+//       Copyright (2022) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
+// See https://kokkos.org/LICENSE for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//@HEADER
+
+#ifndef KOKKOS_TEST_SIMD_CONSTRUCTION_HPP
+#define KOKKOS_TEST_SIMD_CONSTRUCTION_HPP
+
+#include <Kokkos_SIMD.hpp>
+#include <SIMDTesting_Utilities.hpp>
+
+template <typename Abi, typename DataType>
+inline void host_test_simd_traits() {
+  using simd_type = Kokkos::Experimental::simd<DataType, Abi>;
+
+  static_assert(std::is_nothrow_default_constructible_v<simd_type>);
+  static_assert(std::is_nothrow_copy_assignable_v<simd_type>);
+  static_assert(std::is_nothrow_copy_constructible_v<simd_type>);
+  static_assert(std::is_nothrow_move_assignable_v<simd_type>);
+  static_assert(std::is_nothrow_move_constructible_v<simd_type>);
+
+  simd_type default_simd, result;
+  simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
+  simd_type copy_simd(test_simd);
+  simd_type move_simd(std::move(copy_simd));
+  default_simd = std::move(move_simd);
+  result       = default_simd;
+  EXPECT_TRUE(all_of(test_simd == result));
+}
+
+template <typename Abi, typename DataType>
+inline void host_test_mask_traits() {
+  using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>;
+
+  static_assert(std::is_nothrow_default_constructible_v<mask_type>);
+  static_assert(std::is_nothrow_copy_assignable_v<mask_type>);
+  static_assert(std::is_nothrow_copy_constructible_v<mask_type>);
+  static_assert(std::is_nothrow_move_assignable_v<mask_type>);
+  static_assert(std::is_nothrow_move_constructible_v<mask_type>);
+
+  mask_type default_mask, result;
+  mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
+  mask_type copy_mask(test_mask);
+  mask_type move_mask(std::move(copy_mask));
+  default_mask = std::move(move_mask);
+  result       = default_mask;
+  EXPECT_EQ(test_mask, result);
+}
+
+template <typename Abi, typename DataType>
+inline void host_check_construction() {
+  if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) {
+    host_test_simd_traits<Abi, DataType>();
+    host_test_mask_traits<Abi, DataType>();
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+inline void host_check_construction_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (host_check_construction<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+inline void host_check_construction_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (host_check_construction_all_types<Abis>(DataTypes()), ...);
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_test_simd_traits() {
+  using simd_type = Kokkos::Experimental::simd<DataType, Abi>;
+
+  simd_type default_simd, result;
+  simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
+  simd_type copy_simd(test_simd);
+  simd_type move_simd(std::move(copy_simd));
+  default_simd = std::move(move_simd);
+  result       = default_simd;
+
+  kokkos_checker checker;
+  checker.truth(all_of(test_simd == result));
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_test_mask_traits() {
+  using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>;
+
+  mask_type default_mask, result;
+  mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
+  mask_type copy_mask(test_mask);
+  mask_type move_mask(std::move(copy_mask));
+  default_mask = std::move(move_mask);
+  result       = default_mask;
+
+  kokkos_checker checker;
+  checker.truth(test_mask == result);
+}
+
+template <typename Abi, typename DataType>
+KOKKOS_INLINE_FUNCTION void device_check_construction() {
+  if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) {
+    device_test_simd_traits<Abi, DataType>();
+    device_test_mask_traits<Abi, DataType>();
+  }
+}
+
+template <typename Abi, typename... DataTypes>
+KOKKOS_INLINE_FUNCTION void device_check_construction_all_types(
+    Kokkos::Experimental::Impl::data_types<DataTypes...>) {
+  (device_check_construction<Abi, DataTypes>(), ...);
+}
+
+template <typename... Abis>
+KOKKOS_INLINE_FUNCTION void device_check_construction_all_abis(
+    Kokkos::Experimental::Impl::abi_set<Abis...>) {
+  using DataTypes = Kokkos::Experimental::Impl::data_type_set;
+  (device_check_construction_all_types<Abis>(DataTypes()), ...);
+}
+
+class simd_device_construction_functor {
+ public:
+  KOKKOS_INLINE_FUNCTION void operator()(int) const {
+    device_check_construction_all_abis(
+        Kokkos::Experimental::Impl::device_abi_set());
+  }
+};
+
+TEST(simd, host_construction) {
+  host_check_construction_all_abis(Kokkos::Experimental::Impl::host_abi_set());
+}
+
+TEST(simd, device_construction) {
+  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
+                       simd_device_construction_functor());
+}
+
+#endif