diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index bdbc75604b..34df76a0fd 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,4 +1,25 @@ -# Change Log +# CHANGELOG + +## [3.7.02](https://github.com/kokkos/kokkos/tree/3.7.02) (2023-05-17) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.01...3.7.02) + +### Backends and Archs Enhancements: +#### CUDA +- Add Hopper support and update nvcc_wrapper to work with CUDA-12 [\#5693](https://github.com/kokkos/kokkos/pull/5693) +### General Enhancements: +- sprintf -> snprintf [\#5787](https://github.com/kokkos/kokkos/pull/5787) +### Build System: +- Add error message when not using `hipcc` and when `CMAKE_CXX_STANDARD` is not set [\#5945](https://github.com/kokkos/kokkos/pull/5945) +### Bug Fixes: +- Fix Scratch allocation alignment issues [\#5692](https://github.com/kokkos/kokkos/pull/5692) +- Fix Intel Classic Compiler ICE [\#5710](https://github.com/kokkos/kokkos/pull/5710) +- Don't install std algorithm headers multiple times [\#5711](https://github.com/kokkos/kokkos/pull/5711) +- Fix static init order issue in InitalizationSettings [\#5721](https://github.com/kokkos/kokkos/pull/5721) +- Fix src/dst Properties in deep_copy(DynamicView,View) [\#5732](https://github.com/kokkos/kokkos/pull/5732) +- Fix build on Fedora Rawhide [\#5782](https://github.com/kokkos/kokkos/pull/5782) +- Finalize HIP lock arrays [\#5694](https://github.com/kokkos/kokkos/pull/5694) +- Fix CUDA lock arrays for current Desul [\#5812](https://github.com/kokkos/kokkos/pull/5812) +- Set the correct device/context in InterOp tests [\#5701](https://github.com/kokkos/kokkos/pull/5701) ## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 7b78f29d73..404aad8065 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -129,7 +129,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) set(Kokkos_VERSION_MINOR 7) -set(Kokkos_VERSION_PATCH 01) +set(Kokkos_VERSION_PATCH 02) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index b873fd1e06..361995d847 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -12,7 +12,7 @@ endif KOKKOS_VERSION_MAJOR = 3 KOKKOS_VERSION_MINOR = 7 -KOKKOS_VERSION_PATCH = 01 +KOKKOS_VERSION_PATCH = 02 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial diff --git a/lib/kokkos/algorithms/src/CMakeLists.txt b/lib/kokkos/algorithms/src/CMakeLists.txt index 597626b111..606d83d18b 100644 --- a/lib/kokkos/algorithms/src/CMakeLists.txt +++ b/lib/kokkos/algorithms/src/CMakeLists.txt @@ -25,7 +25,7 @@ INSTALL ( # These will get ignored for standalone CMake and a true interface library made KOKKOS_ADD_INTERFACE_LIBRARY( kokkosalgorithms - HEADERS ${ALGO_HEADERS} + NOINSTALLHEADERS ${ALGO_HEADERS} SOURCES ${ALGO_SOURCES} ) KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index f20a91e42f..3aff6857a3 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -214,6 +214,9 @@ GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) IF(KOKKOS_ENABLE_HIP) SET(AMDGPU_ARCH_FLAG "--offload-arch") IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + IF(KOKKOS_CXX_STANDARD STREQUAL 14 AND NOT CMAKE_CXX_STANDARD) + message(FATAL_ERROR "Set CMAKE_CXX_STANDARD to 14") + ENDIF() GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip) IF(DEFINED ENV{ROCM_PATH}) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 34e45ecf72..9a55d2a5e9 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -534,13 +534,6 @@ FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) IF (KOKKOS_HAS_TRILINOS) TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) ELSE() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "HEADERS;SOURCES" - ${ARGN} - ) - ADD_LIBRARY(${NAME} INTERFACE) KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) ENDIF() diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index a2b68064de..8450c06077 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -915,8 +915,8 @@ inline void deep_copy(const View& dst, template inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, const View& src) { - using dst_type = Kokkos::Experimental::DynamicView; - using src_type = View; + using dst_type = Kokkos::Experimental::DynamicView; + using src_type = View; using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index 5345f8ea24..303e3643c9 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -240,6 +240,83 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); #endif } // end scope + + // Test: Reproducer to demonstrate compile-time error of deep_copy + // of DynamicView to/from on-host View. + // Case 4: + { + using device_view_type = Kokkos::View; + using host_view_type = typename Kokkos::View::HostMirror; + + view_type device_dynamic_view("on-device DynamicView", 1024, + arg_total_size); + device_view_type device_view("on-device View", arg_total_size); + host_view_type host_view("on-host View", arg_total_size); + + unsigned da_size = arg_total_size / 8; + device_dynamic_view.resize_serial(da_size); + + // Use parallel_for to populate device_dynamic_view and verify values +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy(0, da_size), + KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); + + value_type result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, da_size), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)device_dynamic_view(i); + }, + result_sum); + + ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); +#endif + + // Use an on-device View as intermediate to deep_copy the + // device_dynamic_view to host, zero out the device_dynamic_view, + // deep_copy from host back to the device_dynamic_view and verify + Kokkos::deep_copy(device_view, device_dynamic_view); + Kokkos::deep_copy(host_view, device_view); + Kokkos::deep_copy(device_view, host_view); +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + Kokkos::parallel_for( + Kokkos::RangePolicy(0, da_size), + KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); +#endif + Kokkos::deep_copy(device_dynamic_view, device_view); +#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, da_size), + KOKKOS_LAMBDA(const int i, value_type& partial_sum) { + partial_sum += (value_type)device_dynamic_view(i); + }, + new_result_sum); + + ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); +#endif + + // Try to deep_copy device_dynamic_view directly to/from host. + // host-to-device currently fails to compile because DP and SP are + // swapped in the deep_copy implementation. + // Once that's fixed, both deep_copy's will fail at runtime because the + // destination execution space cannot access the source memory space. + try { + Kokkos::deep_copy(host_view, device_dynamic_view); + } catch (std::runtime_error const& error) { + std::string msg = error.what(); + std::cerr << "Copy from on-device DynamicView to on-host View failed:\n" + << msg << std::endl; + } + try { + Kokkos::deep_copy(device_dynamic_view, host_view); + } catch (std::runtime_error const& error) { + std::string msg = error.what(); + std::cerr << "Copy from on-host View to on-device DynamicView failed:\n" + << msg << std::endl; + } + } } }; diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp index 54824e5b39..094c40bfa7 100644 --- a/lib/kokkos/core/perf_test/test_atomic.cpp +++ b/lib/kokkos/core/perf_test/test_atomic.cpp @@ -73,7 +73,7 @@ void textcolor(int attr, int fg, int bg) { char command[40]; /* Command is the control command to the terminal */ - sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40); + snprintf(command, 40, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40); printf("%s", command); } void textcolor_standard() { textcolor(RESET, BLACK, WHITE); } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 3785cfe80b..bdba322fcc 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -428,6 +428,8 @@ void HIPInternal::finalize() { if (this == &singleton()) { (void)Kokkos::Impl::hip_global_unique_token_locks(true); + Kokkos::Impl::finalize_host_hip_lock_arrays(); + KOKKOS_IMPL_HIP_SAFE_CALL(hipHostFree(constantMemHostStaging)); KOKKOS_IMPL_HIP_SAFE_CALL(hipEventDestroy(constantMemReusable)); } diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index e42944d819..8d2aa4d188 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -228,11 +228,6 @@ #define KOKKOS_ENABLE_PRAGMA_SIMD 1 #endif -// FIXME Workaround for ICE with intel 17,18,19,20,21 in Trilinos -#if (KOKKOS_COMPILER_INTEL <= 2100) -#define KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS -#endif - // FIXME_SYCL #if !defined(KOKKOS_ENABLE_SYCL) #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 @@ -653,7 +648,8 @@ static constexpr bool kokkos_omp_on_host() { return false; } #if (defined(KOKKOS_COMPILER_GNU) || defined(KOKKOS_COMPILER_CLANG) || \ defined(KOKKOS_COMPILER_INTEL) || defined(KOKKOS_COMPILER_PGI)) && \ !defined(_WIN32) -#if (!defined(__linux__) || defined(__GLIBC_MINOR__)) +// disable stacktrace for musl-libc +#if !defined(__linux__) || defined(__GLIBC_MINOR__) #define KOKKOS_IMPL_ENABLE_STACKTRACE #endif #define KOKKOS_IMPL_ENABLE_CXXABI diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp index 3e37eb61dc..a1d77071f5 100644 --- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -73,9 +73,8 @@ class ScratchMemorySpace { "Instantiating ScratchMemorySpace on non-execution-space type."); public: - // Alignment of memory chunks returned by 'get' - // must be a power of two - enum { ALIGN = 8 }; + // Minimal overalignment used by view scratch allocations + constexpr static int ALIGN = 8; private: mutable char* m_iter_L0 = nullptr; @@ -87,7 +86,9 @@ class ScratchMemorySpace { mutable int m_offset = 0; mutable int m_default_level = 0; - enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + constexpr static int DEFAULT_ALIGNMENT_MASK = ALIGN - 1; +#endif public: //! Tag this class as a memory space @@ -101,39 +102,59 @@ class ScratchMemorySpace { static constexpr const char* name() { return "ScratchMemorySpace"; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + // This function is unused template - KOKKOS_INLINE_FUNCTION static IntType align(const IntType& size) { - return (size + MASK) & ~MASK; + KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static constexpr IntType align( + const IntType& size) { + return (size + DEFAULT_ALIGNMENT_MASK) & ~DEFAULT_ALIGNMENT_MASK; } +#endif template KOKKOS_INLINE_FUNCTION void* get_shmem(const IntType& size, int level = -1) const { - return get_shmem_common(size, 1, level); + return get_shmem_common(size, 1, level); } template KOKKOS_INLINE_FUNCTION void* get_shmem_aligned(const IntType& size, const ptrdiff_t alignment, int level = -1) const { - return get_shmem_common(size, alignment, level); + return get_shmem_common(size, alignment, + level); } private: - template + template KOKKOS_INLINE_FUNCTION void* get_shmem_common(const IntType& size, const ptrdiff_t alignment, int level = -1) const { if (level == -1) level = m_default_level; - auto& m_iter = (level == 0) ? m_iter_L0 : m_iter_L1; - auto& m_end = (level == 0) ? m_end_L0 : m_end_L1; - char* previous = m_iter; - const ptrdiff_t missalign = size_t(m_iter) % alignment; - if (missalign) m_iter += alignment - missalign; + auto& m_iter = (level == 0) ? m_iter_L0 : m_iter_L1; + auto& m_end = (level == 0) ? m_end_L0 : m_end_L1; - void* tmp = m_iter + m_offset * (aligned ? size : align(size)); - if (m_end < (m_iter += (aligned ? size : align(size)) * m_multiplier)) { - m_iter = previous; // put it back like it was + if (alignment_requested) { + const ptrdiff_t missalign = size_t(m_iter) % alignment; + if (missalign) m_iter += alignment - missalign; + } + + // This is each thread's start pointer for its allocation + // Note: for team scratch m_offset is 0, since every + // thread will get back the same shared pointer + void* tmp = m_iter + m_offset * size; + ptrdiff_t increment = size * m_multiplier; + + // increment m_iter first and decrement it again if not + // enough memory was available. In the non-failing path + // this will save instructions. + m_iter += increment; + + if (m_end < m_iter) { + // Request did overflow: reset the base team ptr, and + // return nullptr + m_iter -= increment; + tmp = nullptr; #ifdef KOKKOS_ENABLE_DEBUG // mfh 23 Jun 2015: printf call consumes 25 registers // in a CUDA build, so only print in debug mode. The @@ -143,7 +164,6 @@ class ScratchMemorySpace { "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size), long(m_end - m_iter)); #endif // KOKKOS_ENABLE_DEBUG - tmp = nullptr; } return tmp; } diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp index ffdd1e9fc8..b0d1d693fc 100644 --- a/lib/kokkos/core/src/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Kokkos_Serial.hpp @@ -203,19 +203,11 @@ class Serial { static const char* name(); Impl::SerialInternal* impl_internal_space_instance() const { -#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS - return m_space_instance; -#else return m_space_instance.get(); -#endif } private: -#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS - Impl::SerialInternal* m_space_instance; -#else Kokkos::Impl::HostSharedPtr m_space_instance; -#endif //-------------------------------------------------------------------------- }; diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index f8dcfc869e..795ef91e6a 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -67,6 +67,8 @@ KOKKOS_IMPL_WARNING("Including non-public Kokkos header files is not allowed.") #include +#include + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -1692,19 +1694,27 @@ class View : public ViewTraits { arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); } + private: + // Want to be able to align to minimum scratch alignment or sizeof or alignof + // elements + static constexpr size_t scratch_value_alignment = + ::Kokkos::max(::Kokkos::max(sizeof(typename traits::value_type), + alignof(typename traits::value_type)), + static_cast( + traits::execution_space::scratch_memory_space::ALIGN)); + + public: static KOKKOS_INLINE_FUNCTION size_t shmem_size(typename traits::array_layout const& arg_layout) { - return map_type::memory_span(arg_layout) + - sizeof(typename traits::value_type); + return map_type::memory_span(arg_layout) + scratch_value_alignment; } explicit KOKKOS_INLINE_FUNCTION View( const typename traits::execution_space::scratch_memory_space& arg_space, const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp( - reinterpret_cast(arg_space.get_shmem_aligned( - map_type::memory_span(arg_layout), - sizeof(typename traits::value_type)))), + : View(Impl::ViewCtorProp(reinterpret_cast( + arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), + scratch_value_alignment))), arg_layout) {} explicit KOKKOS_INLINE_FUNCTION View( @@ -1722,7 +1732,7 @@ class View : public ViewTraits { map_type::memory_span(typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)), - sizeof(typename traits::value_type)))), + scratch_value_alignment))), typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7), check_input_args::yes) { diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp index 9205e82560..25d220e45d 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp @@ -173,14 +173,8 @@ void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes, } // namespace Impl Serial::Serial() -#ifdef KOKKOS_IMPL_WORKAROUND_ICE_IN_TRILINOS_WITH_OLD_INTEL_COMPILERS - : m_space_instance(&Impl::SerialInternal::singleton()) { -} -#else : m_space_instance(&Impl::SerialInternal::singleton(), - [](Impl::SerialInternal*) {}) { -} -#endif + [](Impl::SerialInternal*) {}) {} void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index d726a86f76..c3e28c59f4 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -63,11 +63,10 @@ class ParallelFor, const FunctorType m_functor; const MDRangePolicy m_mdr_policy; - const Policy m_policy; void exec() const { - const typename Policy::member_type e = m_policy.end(); - for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { + const typename Policy::member_type e = m_mdr_policy.m_num_tiles; + for (typename Policy::member_type i = 0; i < e; ++i) { iterate_type(m_mdr_policy, m_functor)(i); } } @@ -85,9 +84,7 @@ class ParallelFor, } inline ParallelFor(const FunctorType& arg_functor, const MDRangePolicy& arg_policy) - : m_functor(arg_functor), - m_mdr_policy(arg_policy), - m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {} + : m_functor(arg_functor), m_mdr_policy(arg_policy) {} }; template @@ -120,13 +117,12 @@ class ParallelReduce, ReducerType, const FunctorType m_functor; const MDRangePolicy m_mdr_policy; - const Policy m_policy; const ReducerType m_reducer; const pointer_type m_result_ptr; inline void exec(reference_type update) const { - const typename Policy::member_type e = m_policy.end(); - for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { + const typename Policy::member_type e = m_mdr_policy.m_num_tiles; + for (typename Policy::member_type i = 0; i < e; ++i) { iterate_type(m_mdr_policy, m_functor, update)(i); } } @@ -148,7 +144,8 @@ class ParallelReduce, ReducerType, const size_t team_shared_size = 0; // Never shrinks const size_t thread_local_size = 0; // Never shrinks - auto* internal_instance = m_policy.space().impl_internal_space_instance(); + auto* internal_instance = + m_mdr_policy.space().impl_internal_space_instance(); // Need to lock resize_thread_team_data std::lock_guard lock( internal_instance->m_thread_team_data_mutex); @@ -181,7 +178,6 @@ class ParallelReduce, ReducerType, void*> = nullptr) : m_functor(arg_functor), m_mdr_policy(arg_policy), - m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), m_reducer(InvalidType()), m_result_ptr(arg_result_view.data()) { static_assert(Kokkos::is_view::value, @@ -197,7 +193,6 @@ class ParallelReduce, ReducerType, MDRangePolicy arg_policy, const ReducerType& reducer) : m_functor(arg_functor), m_mdr_policy(arg_policy), - m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)), m_reducer(reducer), m_result_ptr(reducer.view().data()) { /*static_assert( std::is_same< typename ViewType::memory_space diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index a5bd003237..fc89e485fd 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -1165,6 +1165,5 @@ void _kokkos_pgi_compiler_bug_workaround() {} #endif } // namespace Kokkos -Kokkos::Impl::InitializationSettingsHelper::storage_type const - Kokkos::Impl::InitializationSettingsHelper::unspecified = - "some string we don't expect user would ever provide"; +constexpr char + Kokkos::Impl::InitializationSettingsHelper::unspecified[]; diff --git a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp index ceb35f0247..00b2335fd7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp @@ -104,7 +104,9 @@ struct InitializationSettingsHelper { using value_type = std::string; using storage_type = std::string; - static storage_type const unspecified; + // prefer c-string to avoid static initialization order nightmare + static constexpr char unspecified[] = + "some string we don't expect user would ever provide"; }; } // namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp index 480b1a392b..796552cead 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling.cpp @@ -655,9 +655,9 @@ void initialize(const std::string& profileLibrary) { char* envProfileLibrary = const_cast(profileLibrary.c_str()); - const auto envProfileCopy = - std::make_unique(strlen(envProfileLibrary) + 1); - sprintf(envProfileCopy.get(), "%s", envProfileLibrary); + const size_t envProfileLen = strlen(envProfileLibrary) + 1; + const auto envProfileCopy = std::make_unique(envProfileLen); + snprintf(envProfileCopy.get(), envProfileLen, "%s", envProfileLibrary); char* profileLibraryName = strtok(envProfileCopy.get(), ";"); diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp index d915b7e472..55a29f1128 100644 --- a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp +++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.hpp @@ -69,9 +69,10 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, nargs = (do_threads ? 1 : 0) + (do_numa ? 1 : 0) + (do_device ? 1 : 0) + (do_other ? 4 : 0) + (do_tune ? 1 : 0); - char** args_kokkos = new char*[nargs]; + char** args_kokkos = new char*[nargs]; + const int max_args_size = 45; for (int i = 0; i < nargs; i++) { - args_kokkos[i] = new char[45]; + args_kokkos[i] = new char[max_args_size]; delete_these.insert(args_kokkos[i]); } @@ -112,7 +113,7 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, #endif init_args.num_threads = nthreads; - sprintf(args_kokkos[threads_idx], "--threads=%i", nthreads); + snprintf(args_kokkos[threads_idx], max_args_size, "--threads=%i", nthreads); } if (do_numa) { @@ -130,24 +131,27 @@ char** init_kokkos_args(bool do_threads, bool do_numa, bool do_device, #endif init_args.num_numa = numa; - sprintf(args_kokkos[numa_idx], "--numa=%i", numa); + snprintf(args_kokkos[numa_idx], max_args_size, "--numa=%i", numa); } if (do_device) { init_args.device_id = 0; - sprintf(args_kokkos[device_idx], "--device-id=%i", 0); + snprintf(args_kokkos[device_idx], max_args_size, "--device-id=%i", 0); } if (do_other) { - sprintf(args_kokkos[0], "--dummyarg=1"); - sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], "--dummy2arg"); - sprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], "dummy3arg"); - sprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], "dummy4arg=1"); + snprintf(args_kokkos[0], max_args_size, "--dummyarg=1"); + snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0)], max_args_size, + "--dummy2arg"); + snprintf(args_kokkos[threads_idx + (do_threads ? 1 : 0) + 1], max_args_size, + "dummy3arg"); + snprintf(args_kokkos[device_idx + (do_device ? 1 : 0)], max_args_size, + "dummy4arg=1"); } if (do_tune) { init_args.tune_internals = true; - sprintf(args_kokkos[tune_idx], "--kokkos-tune-internals"); + snprintf(args_kokkos[tune_idx], max_args_size, "--kokkos-tune-internals"); } return args_kokkos; diff --git a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp index f66b35dc9f..18f3c9a777 100644 --- a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp +++ b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp @@ -91,7 +91,7 @@ void test_shared_alloc() { // Since always executed on host space, leave [=] Kokkos::parallel_for(range, [=](int i) { char name[64]; - sprintf(name, "test_%.2d", i); + snprintf(name, 64, "test_%.2d", i); r[i] = RecordMemS::allocate(s, name, size * (i + 1)); h[i] = Header::get_header(r[i]->data()); @@ -135,7 +135,7 @@ void test_shared_alloc() { Kokkos::parallel_for(range, [=](size_t i) { char name[64]; - sprintf(name, "test_%.2d", int(i)); + snprintf(name, 64, "test_%.2d", int(i)); RecordFull* rec = RecordFull::allocate(s, name, size * (i + 1)); diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp index 3f05b2ef66..58db4360e5 100644 --- a/lib/kokkos/core/unit_test/TestTeam.hpp +++ b/lib/kokkos/core/unit_test/TestTeam.hpp @@ -1551,14 +1551,16 @@ struct TestScratchAlignment { double x, y, z; }; TestScratchAlignment() { - test(true); - test(false); + test_view(true); + test_view(false); + test_minimal(); + test_raw(); } using ScratchView = Kokkos::View; using ScratchViewInt = Kokkos::View; - void test(bool allocate_small) { + void test_view(bool allocate_small) { int shmem_size = ScratchView::shmem_size(11); #ifdef KOKKOS_ENABLE_OPENMPTARGET int team_size = @@ -1580,12 +1582,68 @@ struct TestScratchAlignment { }); Kokkos::fence(); } + + void test_minimal() { + using member_type = typename Kokkos::TeamPolicy::member_type; + Kokkos::TeamPolicy policy(1, 1); + size_t scratch_size = sizeof(int); + Kokkos::View flag("Flag"); + + Kokkos::parallel_for( + policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)), + KOKKOS_LAMBDA(const member_type &team) { + int *scratch_ptr = (int *)team.team_shmem().get_shmem(scratch_size); + if (scratch_ptr == nullptr) flag() = 1; + }); + Kokkos::fence(); + int minimal_scratch_allocation_failed = 0; + Kokkos::deep_copy(minimal_scratch_allocation_failed, flag); + ASSERT_TRUE(minimal_scratch_allocation_failed == 0); + } + + void test_raw() { + using member_type = typename Kokkos::TeamPolicy::member_type; + Kokkos::TeamPolicy policy(1, 1); + Kokkos::View flag("Flag"); + + Kokkos::parallel_for( + policy.set_scratch_size(0, Kokkos::PerTeam(1024)), + KOKKOS_LAMBDA(const member_type &team) { + int *scratch_ptr1 = (int *)team.team_shmem().get_shmem(24); + int *scratch_ptr2 = (int *)team.team_shmem().get_shmem(32); + int *scratch_ptr3 = (int *)team.team_shmem().get_shmem(12); + + if ((int(scratch_ptr2 - scratch_ptr1) != 6) || + (int(scratch_ptr3 - scratch_ptr2) != 8)) + flag() = 1; + + if (((scratch_ptr3 - static_cast(nullptr)) + 3) % 2 == 1) + scratch_ptr1 = (int *)team.team_shmem().get_shmem_aligned(24, 4); + else { + scratch_ptr1 = (int *)team.team_shmem().get_shmem_aligned(12, 4); + } + scratch_ptr2 = (int *)team.team_shmem().get_shmem_aligned(32, 8); + scratch_ptr3 = (int *)team.team_shmem().get_shmem_aligned(8, 4); + + if ((int(scratch_ptr2 - scratch_ptr1) != 7) && + (int(scratch_ptr2 - scratch_ptr1) != 4)) + flag() = 1; + if (int(scratch_ptr3 - scratch_ptr2) != 8) flag() = 1; + if ((int(size_t(scratch_ptr1) % 4) != 0) || + (int(size_t(scratch_ptr2) % 8) != 0) || + (int(size_t(scratch_ptr3) % 4) != 0)) + flag() = 1; + }); + Kokkos::fence(); + int raw_get_shmem_alignment_failed = 0; + Kokkos::deep_copy(raw_get_shmem_alignment_failed, flag); + ASSERT_TRUE(raw_get_shmem_alignment_failed == 0); + } }; } // namespace namespace { - template struct TestTeamPolicyHandleByValue { using scalar = double; diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp index 31fd63f084..c4381b29e7 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Init.cpp @@ -59,9 +59,11 @@ __global__ void offset(int* p) { // Test whether allocations survive Kokkos initialize/finalize if done via Raw // Cuda. TEST(cuda, raw_cuda_interop) { + // Make sure that we use the same device for all allocations + Kokkos::initialize(); + int* p; KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMalloc(&p, sizeof(int) * 100)); - Kokkos::initialize(); Kokkos::View> v(p, 100); Kokkos::deep_copy(v, 5); diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp index f11f657e00..69aef6f8d8 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_InterOp_Streams.cpp @@ -48,9 +48,11 @@ namespace Test { // Test Interoperability with Cuda Streams TEST(cuda, raw_cuda_streams) { + // Make sure that we use the same device for all allocations + Kokkos::initialize(); + cudaStream_t stream; cudaStreamCreate(&stream); - Kokkos::initialize(); int* p; cudaMalloc(&p, sizeof(int) * 100); using MemorySpace = typename TEST_EXECSPACE::memory_space; diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp index af20e753d4..ce76076f9b 100644 --- a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp +++ b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Init.cpp @@ -59,9 +59,11 @@ __global__ void offset(int* p) { // Test whether allocations survive Kokkos initialize/finalize if done via Raw // HIP. TEST(hip, raw_hip_interop) { + // Make sure that we use the same device for all allocations + Kokkos::initialize(); + int* p; KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100)); - Kokkos::initialize(); Kokkos::View> v(p, 100); Kokkos::deep_copy(v, 5); diff --git a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp index 95d102d4d1..6c7fdd7044 100644 --- a/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp +++ b/lib/kokkos/core/unit_test/hip/TestHIP_InterOp_Streams.cpp @@ -50,9 +50,11 @@ namespace Test { // The difference with the CUDA tests are: raw HIP vs raw CUDA and no launch // bound in HIP due to an error when computing the block size. TEST(hip, raw_hip_streams) { + // Make sure that we use the same device for all allocations + Kokkos::initialize(); + hipStream_t stream; KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); - Kokkos::initialize(); int* p; KOKKOS_IMPL_HIP_SAFE_CALL(hipMalloc(&p, sizeof(int) * 100)); using MemorySpace = typename TEST_EXECSPACE::memory_space; diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp index e45d990745..1189aba26f 100644 --- a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp +++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init.cpp @@ -52,8 +52,8 @@ namespace Test { // Test whether allocations survive Kokkos initialize/finalize if done via Raw // SYCL. TEST(sycl, raw_sycl_interop) { + // Make sure all queues use the same context Kokkos::initialize(); - Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp index 114d2a4aa2..7b9a664304 100644 --- a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp +++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Init_Context.cpp @@ -51,6 +51,7 @@ namespace Test { // Test whether external allocations can be accessed by the default queue. TEST(sycl, raw_sycl_interop_context_1) { + // Make sure all queues use the same context Kokkos::Experimental::SYCL default_space; sycl::context default_context = default_space.sycl_queue().get_context(); diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp index 8ffada1dab..be093d8edc 100644 --- a/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp +++ b/lib/kokkos/core/unit_test/sycl/TestSYCL_InterOp_Streams.cpp @@ -48,9 +48,13 @@ namespace Test { // Test Interoperability with SYCL Streams TEST(sycl, raw_sycl_queues) { - sycl::default_selector device_selector; - sycl::queue queue(device_selector); + // Make sure all queues use the same context Kokkos::initialize(); + Kokkos::Experimental::SYCL default_space; + sycl::context default_context = default_space.sycl_queue().get_context(); + + sycl::default_selector device_selector; + sycl::queue queue(default_context, device_selector); int* p = sycl::malloc_device(100, queue); using MemorySpace = typename TEST_EXECSPACE::memory_space; diff --git a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp index 1815adb4a7..b55be52264 100644 --- a/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp +++ b/lib/kokkos/tpls/desul/include/desul/atomics/Lock_Array_Cuda.hpp @@ -76,7 +76,7 @@ namespace Impl { /// instances in other translation units, we must update this CUDA global /// variable based on the Host global variable prior to running any kernels /// that will use it. -/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro. +/// That is the purpose of the ensure_cuda_lock_arrays_on_device function. __device__ #ifdef __CUDACC_RDC__ __constant__ extern @@ -138,33 +138,42 @@ namespace { static int lock_array_copied = 0; inline int eliminate_warning_for_lock_array() { return lock_array_copied; } } // namespace + +#ifdef __CUDACC_RDC__ +inline +#else +inline static +#endif + void + copy_cuda_lock_arrays_to_device() { + if (lock_array_copied == 0) { + cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_DEVICE, + &CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, + sizeof(int32_t*)); + cudaMemcpyToSymbol(CUDA_SPACE_ATOMIC_LOCKS_NODE, + &CUDA_SPACE_ATOMIC_LOCKS_NODE_h, + sizeof(int32_t*)); + } + lock_array_copied = 1; +} + } // namespace Impl } // namespace desul -/* It is critical that this code be a macro, so that it will - capture the right address for desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE - putting this in an inline function will NOT do the right thing! */ -#define DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \ - { \ - if (::desul::Impl::lock_array_copied == 0) { \ - cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE, \ - &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h, \ - sizeof(int32_t*)); \ - cudaMemcpyToSymbol(::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE, \ - &::desul::Impl::CUDA_SPACE_ATOMIC_LOCKS_NODE_h, \ - sizeof(int32_t*)); \ - } \ - ::desul::Impl::lock_array_copied = 1; \ - } #endif /* defined( __CUDACC__ ) */ #endif /* defined( DESUL_HAVE_CUDA_ATOMICS ) */ +namespace desul { + #if defined(__CUDACC_RDC__) || (!defined(__CUDACC__)) -#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() +inline void ensure_cuda_lock_arrays_on_device() {} #else -#define DESUL_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() \ - DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() +static inline void ensure_cuda_lock_arrays_on_device() { + Impl::copy_cuda_lock_arrays_to_device(); +} #endif -#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP_ */ +} // namespace desul + +#endif /* #ifndef DESUL_ATOMICS_LOCK_ARRAY_CUDA_HPP_ */ diff --git a/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp b/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp index cb8482c5da..19944b378e 100644 --- a/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp +++ b/lib/kokkos/tpls/desul/src/Lock_Array_CUDA.cpp @@ -70,7 +70,7 @@ void init_lock_arrays_cuda() { "init_lock_arrays_cuda: cudaMalloc host locks"); auto error_sync1 = cudaDeviceSynchronize(); - DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); + copy_cuda_lock_arrays_to_device(); check_error_and_throw_cuda(error_sync1, "init_lock_arrays_cuda: post mallocs"); init_lock_arrays_cuda_kernel<<<(CUDA_SPACE_ATOMIC_MASK + 1 + 255) / 256, 256>>>(); auto error_sync2 = cudaDeviceSynchronize(); @@ -85,7 +85,7 @@ void finalize_lock_arrays_cuda() { CUDA_SPACE_ATOMIC_LOCKS_DEVICE_h = nullptr; CUDA_SPACE_ATOMIC_LOCKS_NODE_h = nullptr; #ifdef __CUDACC_RDC__ - DESUL_IMPL_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE(); + copy_cuda_lock_arrays_to_device(); #endif }