Update Kokkos library in LAMMPS to v4.4.1

This commit is contained in:
Stan Moore
2024-09-13 12:14:49 -06:00
parent 3079d51eaf
commit 487f7ade68
23 changed files with 470 additions and 80 deletions

View File

@ -1,5 +1,20 @@
# CHANGELOG
## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01)
### Features:
* Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229)
### Backend and Architecture Enhancements:
#### CUDA:
* Experimental support for unified memory mode (intended for Grace-Hopper etc.) [\#6823](https://github.com/kokkos/kokkos/pull/6823)
### Bug Fixes
* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284)
* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296)
## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00)

View File

@ -151,7 +151,7 @@ ENDIF()
set(Kokkos_VERSION_MAJOR 4)
set(Kokkos_VERSION_MINOR 4)
set(Kokkos_VERSION_PATCH 0)
set(Kokkos_VERSION_PATCH 1)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")

View File

@ -12,7 +12,7 @@ endif
KOKKOS_VERSION_MAJOR = 4
KOKKOS_VERSION_MINOR = 4
KOKKOS_VERSION_PATCH = 0
KOKKOS_VERSION_PATCH = 1
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial

View File

@ -37,6 +37,7 @@
#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated
#cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY

View File

@ -48,6 +48,8 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
# resolved but we keep the option around a bit longer to be safe.
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")
KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" )
KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP")
@ -135,7 +137,7 @@ FUNCTION(check_device_specific_options)
ENDIF()
ENDFUNCTION()
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH)

View File

@ -37,6 +37,17 @@
#endif
///@}
/// Some tests are skipped for unified memory space
#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \
if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \
Kokkos::CudaSpace>) \
GTEST_SKIP() << "skipping since unified memory requires additional " \
"fences";
#else
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
#endif
TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels());
@ -657,6 +668,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {
TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),

View File

@ -31,7 +31,6 @@
#include <algorithm>
#include <atomic>
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Tools.hpp>
@ -178,6 +177,29 @@ void *impl_allocate_common(const int device_id,
cudaError_t error_code = cudaSuccess;
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
// This is intended for Grace-Hopper (and future unified memory architectures)
// The idea is to use host allocator and then advise to keep it in HBM on the
// device, but that requires CUDA 12.2
static_assert(CUDART_VERSION >= 12020,
"CUDA runtime version >=12.2 required when "
"Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. "
"Please update your CUDA runtime version or "
"reconfigure with "
"-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF");
if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr
error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
if (error_code == cudaSuccess) {
// One would think cudaMemLocation{device_id,
// cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of
// members doesn't seem to be defined.
cudaMemLocation loc;
loc.id = device_id;
loc.type = cudaMemLocationTypeDevice;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
}
}
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
@ -190,9 +212,13 @@ void *impl_allocate_common(const int device_id,
"Kokkos::Cuda: backend fence after async malloc");
}
}
} else
} else {
error_code = cudaMalloc(&ptr, arg_alloc_size);
}
#else
error_code = cudaMalloc(&ptr, arg_alloc_size);
#endif
{ error_code = cudaMalloc(&ptr, arg_alloc_size); }
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
@ -326,6 +352,9 @@ void CudaSpace::impl_deallocate(
}
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
Impl::cuda_device_synchronize(
@ -436,8 +465,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
#include <impl/Kokkos_SharedAlloc_timpl.hpp>
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(

View File

@ -88,6 +88,19 @@ class CudaSpace {
void* allocate(const char* arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;
#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
return allocate(arg_alloc_size);
}
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const char* arg_label,
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const {
return allocate(arg_label, arg_alloc_size, arg_logical_size);
}
#endif
/**\brief Deallocate untracked memory in the cuda space */
void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
void deallocate(const char* arg_label, void* const arg_alloc_ptr,
@ -337,7 +350,11 @@ static_assert(
template <>
struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
enum : bool { assignable = false };
enum : bool { accessible = false };
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
enum : bool{accessible = false};
#else
enum : bool { accessible = true };
#endif
enum : bool { deepcopy = true };
};
@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);

View File

@ -607,6 +607,22 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
//----------------------------------
#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
// Check if unified memory is available
int cuda_result;
cudaDeviceGetAttribute(&cuda_result, cudaDevAttrConcurrentManagedAccess,
cuda_device_id);
if (cuda_result == 0) {
Kokkos::abort(
"Kokkos::Cuda::initialize ERROR: Unified memory is not available on "
"this device\n"
"Please recompile Kokkos with "
"-DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF\n");
}
#endif
//----------------------------------
cudaStream_t singleton_stream;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
@ -705,6 +721,10 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
#else
os << "no\n";
#endif
#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
os << " KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: ";
os << "yes\n";
#endif
os << "\nCuda Runtime Configuration:\n";

View File

@ -571,6 +571,8 @@ inline constexpr Kokkos::ALL_t ALL{};
#pragma omp end declare target
#endif
inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{};
inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{};
inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{};

View File

@ -113,7 +113,7 @@ int OpenMP::impl_thread_pool_size() const noexcept {
}
int OpenMP::impl_max_hardware_threads() noexcept {
return Impl::g_openmp_hardware_max_threads;
return Impl::OpenMPInternal::max_hardware_threads();
}
namespace Impl {

View File

@ -31,12 +31,20 @@
#include <sstream>
#include <thread>
namespace {
int g_openmp_hardware_max_threads = 1;
}
namespace Kokkos {
namespace Impl {
std::vector<OpenMPInternal *> OpenMPInternal::all_instances;
std::mutex OpenMPInternal::all_instances_mutex;
int OpenMPInternal::max_hardware_threads() noexcept {
return g_openmp_hardware_max_threads;
}
void OpenMPInternal::clear_thread_data() {
const size_t member_bytes =
sizeof(int64_t) *
@ -188,9 +196,9 @@ void OpenMPInternal::initialize(int thread_count) {
// Before any other call to OMP query the maximum number of threads
// and save the value for re-initialization unit testing.
Impl::g_openmp_hardware_max_threads = get_current_max_threads();
g_openmp_hardware_max_threads = get_current_max_threads();
int process_num_threads = Impl::g_openmp_hardware_max_threads;
int process_num_threads = g_openmp_hardware_max_threads;
if (Kokkos::hwloc::available()) {
process_num_threads = Kokkos::hwloc::get_available_numa_count() *
@ -203,11 +211,11 @@ void OpenMPInternal::initialize(int thread_count) {
// process_num_threads if thread_count > 0, set
// g_openmp_hardware_max_threads to thread_count
if (thread_count < 0) {
thread_count = Impl::g_openmp_hardware_max_threads;
thread_count = g_openmp_hardware_max_threads;
} else if (thread_count == 0) {
if (Impl::g_openmp_hardware_max_threads != process_num_threads) {
Impl::g_openmp_hardware_max_threads = process_num_threads;
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
if (g_openmp_hardware_max_threads != process_num_threads) {
g_openmp_hardware_max_threads = process_num_threads;
omp_set_num_threads(g_openmp_hardware_max_threads);
}
} else {
if (Kokkos::show_warnings() && thread_count > process_num_threads) {
@ -218,16 +226,16 @@ void OpenMPInternal::initialize(int thread_count) {
<< ", requested thread : " << std::setw(3) << thread_count
<< std::endl;
}
Impl::g_openmp_hardware_max_threads = thread_count;
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
g_openmp_hardware_max_threads = thread_count;
omp_set_num_threads(g_openmp_hardware_max_threads);
}
// setup thread local
#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
#pragma omp parallel num_threads(g_openmp_hardware_max_threads)
{ Impl::SharedAllocationRecord<void, void>::tracking_enable(); }
auto &instance = OpenMPInternal::singleton();
instance.m_pool_size = Impl::g_openmp_hardware_max_threads;
instance.m_pool_size = g_openmp_hardware_max_threads;
// New, unified host thread team data:
{
@ -272,10 +280,9 @@ void OpenMPInternal::finalize() {
if (this == &singleton()) {
auto const &instance = singleton();
// Silence Cuda Warning
const int nthreads =
instance.m_pool_size <= Impl::g_openmp_hardware_max_threads
? Impl::g_openmp_hardware_max_threads
: instance.m_pool_size;
const int nthreads = instance.m_pool_size <= g_openmp_hardware_max_threads
? g_openmp_hardware_max_threads
: instance.m_pool_size;
(void)nthreads;
#pragma omp parallel num_threads(nthreads)
@ -284,7 +291,7 @@ void OpenMPInternal::finalize() {
// allow main thread to track
Impl::SharedAllocationRecord<void, void>::tracking_enable();
Impl::g_openmp_hardware_max_threads = 1;
g_openmp_hardware_max_threads = 1;
}
m_initialized = false;
@ -307,7 +314,7 @@ void OpenMPInternal::print_configuration(std::ostream &s) const {
if (m_initialized) {
const int numa_count = 1;
const int core_per_numa = Impl::g_openmp_hardware_max_threads;
const int core_per_numa = g_openmp_hardware_max_threads;
const int thread_per_core = 1;
s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa

View File

@ -47,8 +47,6 @@ namespace Impl {
class OpenMPInternal;
inline int g_openmp_hardware_max_threads = 1;
struct OpenMPTraits {
static constexpr int MAX_THREAD_COUNT = 512;
};
@ -86,6 +84,8 @@ class OpenMPInternal {
void clear_thread_data();
static int max_hardware_threads() noexcept;
int thread_pool_size() const { return m_pool_size; }
void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes,

View File

@ -105,7 +105,8 @@ class UniqueToken<OpenMP, UniqueTokenScope::Global> {
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int size() const noexcept {
KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;))
KOKKOS_IF_ON_HOST(
(return Kokkos::Impl::OpenMPInternal::max_hardware_threads();))
KOKKOS_IF_ON_DEVICE((return 0;))
}

View File

@ -313,6 +313,51 @@ struct ViewValueFunctor<DeviceType, ValueType, true /* is_scalar */> {
void destroy_shared_allocation() {}
};
template <class DeviceType, class ValueType>
struct ViewValueFunctorSequentialHostInit {
using ExecSpace = typename DeviceType::execution_space;
using MemSpace = typename DeviceType::memory_space;
static_assert(SpaceAccessibility<HostSpace, MemSpace>::accessible);
ValueType* ptr;
size_t n;
ViewValueFunctorSequentialHostInit() = default;
ViewValueFunctorSequentialHostInit(ExecSpace const& /*arg_space*/,
ValueType* const arg_ptr,
size_t const arg_n,
std::string /*arg_name*/)
: ptr(arg_ptr), n(arg_n) {}
ViewValueFunctorSequentialHostInit(ValueType* const arg_ptr,
size_t const arg_n,
std::string /*arg_name*/)
: ptr(arg_ptr), n(arg_n) {}
void construct_shared_allocation() {
if constexpr (std::is_trivial_v<ValueType>) {
// value-initialization is equivalent to filling with zeros
std::memset(static_cast<void*>(ptr), 0, n * sizeof(ValueType));
} else {
for (size_t i = 0; i < n; ++i) {
new (ptr + i) ValueType();
}
}
}
void destroy_shared_allocation() {
if constexpr (std::is_trivially_destructible_v<ValueType>) {
// do nothing, don't bother calling the destructor
} else {
for (size_t i = 0; i < n; ++i) {
(ptr + i)->~ValueType();
}
}
}
};
} // namespace Kokkos::Impl
#endif // KOKKOS_VIEW_ALLOC_HPP

View File

@ -23,12 +23,16 @@
namespace Kokkos {
namespace Impl {
struct SequentialHostInit_t {};
struct WithoutInitializing_t {};
struct AllowPadding_t {};
template <typename>
struct is_view_ctor_property : public std::false_type {};
template <>
struct is_view_ctor_property<SequentialHostInit_t> : public std::true_type {};
template <>
struct is_view_ctor_property<WithoutInitializing_t> : public std::true_type {};
@ -84,10 +88,10 @@ struct ViewCtorProp<void, CommonViewAllocProp<Specialize, T>> {
/* Property flags have constexpr value */
template <typename P>
struct ViewCtorProp<
std::enable_if_t<std::is_same<P, AllowPadding_t>::value ||
std::is_same<P, WithoutInitializing_t>::value>,
P> {
struct ViewCtorProp<std::enable_if_t<std::is_same_v<P, AllowPadding_t> ||
std::is_same_v<P, WithoutInitializing_t> ||
std::is_same_v<P, SequentialHostInit_t>>,
P> {
ViewCtorProp() = default;
ViewCtorProp(const ViewCtorProp &) = default;
ViewCtorProp &operator=(const ViewCtorProp &) = default;
@ -199,6 +203,11 @@ struct ViewCtorProp : public ViewCtorProp<void, P>... {
Kokkos::Impl::has_type<AllowPadding_t, P...>::value;
static constexpr bool initialize =
!Kokkos::Impl::has_type<WithoutInitializing_t, P...>::value;
static constexpr bool sequential_host_init =
Kokkos::Impl::has_type<SequentialHostInit_t, P...>::value;
static_assert(initialize || !sequential_host_init,
"Incompatible WithoutInitializing and SequentialHostInit view "
"alloc properties");
using memory_space = typename var_memory_space::type;
using execution_space = typename var_execution_space::type;
@ -251,7 +260,9 @@ auto with_properties_if_unset(const ViewCtorProp<P...> &view_ctor_prop,
(is_view_label<Property>::value &&
!ViewCtorProp<P...>::has_label) ||
(std::is_same_v<Property, WithoutInitializing_t> &&
ViewCtorProp<P...>::initialize)) {
ViewCtorProp<P...>::initialize) ||
(std::is_same_v<Property, SequentialHostInit_t> &&
!ViewCtorProp<P...>::sequential_host_init)) {
using NewViewCtorProp = ViewCtorProp<P..., Property>;
NewViewCtorProp new_view_ctor_prop(view_ctor_prop);
static_cast<ViewCtorProp<void, Property> &>(new_view_ctor_prop).value =
@ -299,7 +310,9 @@ struct WithPropertiesIfUnset<ViewCtorProp<P...>, Property, Properties...> {
(is_view_label<Property>::value &&
!ViewCtorProp<P...>::has_label) ||
(std::is_same_v<Property, WithoutInitializing_t> &&
ViewCtorProp<P...>::initialize)) {
ViewCtorProp<P...>::initialize) ||
(std::is_same_v<Property, SequentialHostInit_t> &&
!ViewCtorProp<P...>::sequential_host_init)) {
using NewViewCtorProp = ViewCtorProp<P..., Property>;
NewViewCtorProp new_view_ctor_prop(view_ctor_prop);
static_cast<ViewCtorProp<void, Property> &>(new_view_ctor_prop).value =

View File

@ -2825,10 +2825,12 @@ class ViewMapping<
using memory_space = typename Traits::memory_space;
static_assert(
SpaceAccessibility<execution_space, memory_space>::accessible);
using value_type = typename Traits::value_type;
using functor_type =
ViewValueFunctor<Kokkos::Device<execution_space, memory_space>,
value_type>;
using device_type = Kokkos::Device<execution_space, memory_space>;
using value_type = typename Traits::value_type;
using functor_type = std::conditional_t<
alloc_prop::sequential_host_init,
ViewValueFunctorSequentialHostInit<device_type, value_type>,
ViewValueFunctor<device_type, value_type>>;
using record_type =
Kokkos::Impl::SharedAllocationRecord<memory_space, functor_type>;

View File

@ -20,7 +20,7 @@
namespace {
// User-defined type with a View data member
// User-defined types with a View data member
template <class V>
class S {
V v_;
@ -28,48 +28,102 @@ class S {
public:
template <class... Extents>
S(std::string label, Extents... extents) : v_(std::move(label), extents...) {}
S() = default;
KOKKOS_DEFAULTED_FUNCTION S() = default;
};
template <class V>
void test_view_of_views() {
class N { // not default constructible
V v_;
public:
template <class... Extents>
N(std::string label, Extents... extents) : v_(std::move(label), extents...) {}
};
template <class V>
class H { // constructible and destructible only from on the host side
V v_;
public:
template <class... Extents>
H(std::string label, Extents... extents) : v_(std::move(label), extents...) {}
H() {}
~H() {}
};
template <class V>
void test_view_of_views_default() {
// assigning a default-constructed view to destruct the inner objects
using VoV = Kokkos::View<V**, Kokkos::HostSpace>;
{ // assigning a default-constructed view to destruct the inner objects
VoV vov("vov", 2, 3);
V a("a");
V b("b");
vov(0, 0) = a;
vov(1, 0) = a;
vov(0, 1) = b;
VoV vov("vov", 2, 3);
V a("a");
V b("b");
vov(0, 0) = a;
vov(1, 0) = a;
vov(0, 1) = b;
#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND
vov(0, 0) = V();
vov(1, 0) = V();
vov(0, 1) = V();
vov(0, 0) = V();
vov(1, 0) = V();
vov(0, 1) = V();
#endif
}
{ // using placement new to construct the inner objects and explicitly
// calling the destructor
VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3);
V a("a");
V b("b");
new (&vov(0, 0)) V(a);
new (&vov(1, 0)) V(a);
new (&vov(0, 1)) V(b);
#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND
vov(0, 0).~V();
vov(1, 0).~V();
vov(0, 1).~V();
#else
// leaks memory
#endif
}
}
TEST(TEST_CATEGORY, view_of_views) {
test_view_of_views<Kokkos::View<int, TEST_EXECSPACE>>();
test_view_of_views<Kokkos::View<int[4], TEST_EXECSPACE>>();
template <class V>
void test_view_of_views_without_initializing() {
// using placement new to construct the inner objects and explicitly
// calling the destructor
using VoV = Kokkos::View<V**, Kokkos::HostSpace>;
VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3);
V a("a");
V b("b");
new (&vov(0, 0)) V(a);
new (&vov(1, 0)) V(a);
new (&vov(0, 1)) V(b);
#ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND
vov(0, 0).~V();
vov(1, 0).~V();
vov(0, 1).~V();
#else
// leaks memory
#endif
}
template <class V>
void test_view_of_views_sequential_host_init() {
// inner views value-initialized sequentially on the host, and also
// sequentially destructed on the host, without the need to cleanup
using VoV = Kokkos::View<V**, Kokkos::HostSpace>;
VoV vov(Kokkos::view_alloc("vov", Kokkos::SequentialHostInit), 2, 3);
V a("a");
V b("b");
vov(0, 0) = a;
vov(1, 0) = a;
vov(0, 1) = b;
}
TEST(TEST_CATEGORY, view_of_views_default) {
test_view_of_views_default<Kokkos::View<int, TEST_EXECSPACE>>();
test_view_of_views_default<Kokkos::View<int[4], TEST_EXECSPACE>>();
// User-defined type with View data member
test_view_of_views<S<Kokkos::View<float, TEST_EXECSPACE>>>();
test_view_of_views_default<S<Kokkos::View<float, TEST_EXECSPACE>>>();
}
TEST(TEST_CATEGORY, view_of_views_without_initializing) {
test_view_of_views_without_initializing<Kokkos::View<int, TEST_EXECSPACE>>();
test_view_of_views_without_initializing<
S<Kokkos::View<float, TEST_EXECSPACE>>>();
test_view_of_views_without_initializing<
N<Kokkos::View<double, TEST_EXECSPACE>>>();
test_view_of_views_without_initializing<
H<Kokkos::View<int, TEST_EXECSPACE>>>();
}
TEST(TEST_CATEGORY, test_view_of_views_sequential_host_init) {
test_view_of_views_sequential_host_init<Kokkos::View<int, TEST_EXECSPACE>>();
test_view_of_views_sequential_host_init<
S<Kokkos::View<float, TEST_EXECSPACE>>>();
test_view_of_views_sequential_host_init<
H<Kokkos::View<int, TEST_EXECSPACE>>>();
}
} // namespace

View File

@ -39,9 +39,14 @@ TEST(cuda, space_access) {
!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
Kokkos::CudaSpace>::assignable);
#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
static_assert(
!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
Kokkos::CudaSpace>::accessible);
#else
static_assert(Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
Kokkos::CudaSpace>::accessible);
#endif
static_assert(
!Kokkos::Impl::MemorySpaceAccess<Kokkos::HostSpace,
@ -147,8 +152,13 @@ TEST(cuda, space_access) {
Kokkos::SpaceAccessibility<Kokkos::Cuda,
Kokkos::CudaHostPinnedSpace>::accessible);
#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
static_assert(!Kokkos::SpaceAccessibility<Kokkos::HostSpace,
Kokkos::CudaSpace>::accessible);
#else
static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
Kokkos::CudaSpace>::accessible);
#endif
static_assert(Kokkos::SpaceAccessibility<Kokkos::HostSpace,
Kokkos::CudaUVMSpace>::accessible);
@ -157,8 +167,14 @@ TEST(cuda, space_access) {
Kokkos::SpaceAccessibility<Kokkos::HostSpace,
Kokkos::CudaHostPinnedSpace>::accessible);
#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
Kokkos::HostSpace>::value);
#else
static_assert(std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaSpace>::Space,
Kokkos::Device<Kokkos::HostSpace::execution_space,
Kokkos::CudaSpace>>::value);
#endif
static_assert(
std::is_same<Kokkos::Impl::HostMirror<Kokkos::CudaUVMSpace>::Space,

View File

@ -38,3 +38,4 @@ tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a
tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4
tag: 4.3.01 date: 05:07:2024 master: 486cc745 release: 262d2d6e
tag: 4.4.00 date: 08:08:2024 master: 6ecdf605 release: 6068673c
tag: 4.4.01 date: 09:12:2024 master: 08ceff92 release: 2d60c039

View File

@ -361,9 +361,7 @@ class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<4>> {
};
using value_type = bool;
using abi_type = simd_abi::avx2_fixed_size<4>;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
: m_value(_mm_set1_epi32(-std::int32_t(value))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
@ -460,9 +458,7 @@ class simd_mask<std::int32_t, simd_abi::avx2_fixed_size<8>> {
};
using value_type = bool;
using abi_type = simd_abi::avx2_fixed_size<8>;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
: m_value(_mm256_set1_epi32(-std::int32_t(value))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {
@ -561,9 +557,7 @@ class simd_mask<std::int64_t, simd_abi::avx2_fixed_size<4>> {
};
using value_type = bool;
using abi_type = simd_abi::avx2_fixed_size<4>;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default;
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value)
: m_value(_mm256_set1_epi64x(-std::int64_t(value))) {}
KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() {

View File

@ -22,3 +22,4 @@
#include <TestSIMD_GeneratorCtors.hpp>
#include <TestSIMD_WhereExpressions.hpp>
#include <TestSIMD_Reductions.hpp>
#include <TestSIMD_Construction.hpp>

View File

@ -0,0 +1,150 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#ifndef KOKKOS_TEST_SIMD_CONSTRUCTION_HPP
#define KOKKOS_TEST_SIMD_CONSTRUCTION_HPP
#include <Kokkos_SIMD.hpp>
#include <SIMDTesting_Utilities.hpp>
template <typename Abi, typename DataType>
inline void host_test_simd_traits() {
using simd_type = Kokkos::Experimental::simd<DataType, Abi>;
static_assert(std::is_nothrow_default_constructible_v<simd_type>);
static_assert(std::is_nothrow_copy_assignable_v<simd_type>);
static_assert(std::is_nothrow_copy_constructible_v<simd_type>);
static_assert(std::is_nothrow_move_assignable_v<simd_type>);
static_assert(std::is_nothrow_move_constructible_v<simd_type>);
simd_type default_simd, result;
simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
simd_type copy_simd(test_simd);
simd_type move_simd(std::move(copy_simd));
default_simd = std::move(move_simd);
result = default_simd;
EXPECT_TRUE(all_of(test_simd == result));
}
template <typename Abi, typename DataType>
inline void host_test_mask_traits() {
using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>;
static_assert(std::is_nothrow_default_constructible_v<mask_type>);
static_assert(std::is_nothrow_copy_assignable_v<mask_type>);
static_assert(std::is_nothrow_copy_constructible_v<mask_type>);
static_assert(std::is_nothrow_move_assignable_v<mask_type>);
static_assert(std::is_nothrow_move_constructible_v<mask_type>);
mask_type default_mask, result;
mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
mask_type copy_mask(test_mask);
mask_type move_mask(std::move(copy_mask));
default_mask = std::move(move_mask);
result = default_mask;
EXPECT_EQ(test_mask, result);
}
template <typename Abi, typename DataType>
inline void host_check_construction() {
if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) {
host_test_simd_traits<Abi, DataType>();
host_test_mask_traits<Abi, DataType>();
}
}
template <typename Abi, typename... DataTypes>
inline void host_check_construction_all_types(
Kokkos::Experimental::Impl::data_types<DataTypes...>) {
(host_check_construction<Abi, DataTypes>(), ...);
}
template <typename... Abis>
inline void host_check_construction_all_abis(
Kokkos::Experimental::Impl::abi_set<Abis...>) {
using DataTypes = Kokkos::Experimental::Impl::data_type_set;
(host_check_construction_all_types<Abis>(DataTypes()), ...);
}
template <typename Abi, typename DataType>
KOKKOS_INLINE_FUNCTION void device_test_simd_traits() {
using simd_type = Kokkos::Experimental::simd<DataType, Abi>;
simd_type default_simd, result;
simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
simd_type copy_simd(test_simd);
simd_type move_simd(std::move(copy_simd));
default_simd = std::move(move_simd);
result = default_simd;
kokkos_checker checker;
checker.truth(all_of(test_simd == result));
}
template <typename Abi, typename DataType>
KOKKOS_INLINE_FUNCTION void device_test_mask_traits() {
using mask_type = Kokkos::Experimental::simd_mask<DataType, Abi>;
mask_type default_mask, result;
mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); });
mask_type copy_mask(test_mask);
mask_type move_mask(std::move(copy_mask));
default_mask = std::move(move_mask);
result = default_mask;
kokkos_checker checker;
checker.truth(test_mask == result);
}
template <typename Abi, typename DataType>
KOKKOS_INLINE_FUNCTION void device_check_construction() {
if constexpr (is_type_v<Kokkos::Experimental::simd<DataType, Abi>>) {
device_test_simd_traits<Abi, DataType>();
device_test_mask_traits<Abi, DataType>();
}
}
template <typename Abi, typename... DataTypes>
KOKKOS_INLINE_FUNCTION void device_check_construction_all_types(
Kokkos::Experimental::Impl::data_types<DataTypes...>) {
(device_check_construction<Abi, DataTypes>(), ...);
}
template <typename... Abis>
KOKKOS_INLINE_FUNCTION void device_check_construction_all_abis(
Kokkos::Experimental::Impl::abi_set<Abis...>) {
using DataTypes = Kokkos::Experimental::Impl::data_type_set;
(device_check_construction_all_types<Abis>(DataTypes()), ...);
}
class simd_device_construction_functor {
public:
KOKKOS_INLINE_FUNCTION void operator()(int) const {
device_check_construction_all_abis(
Kokkos::Experimental::Impl::device_abi_set());
}
};
TEST(simd, host_construction) {
host_check_construction_all_abis(Kokkos::Experimental::Impl::host_abi_set());
}
TEST(simd, device_construction) {
Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::IndexType<int>>(0, 1),
simd_device_construction_functor());
}
#endif