diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index c6115f4b3d..40e3c95f24 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,26 @@ # CHANGELOG +## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01) + +### Backend and Architecture Enhancements: + +#### CUDA: +- Add warp sync for `parallel_reduce` to avoid race condition [\#6630](https://github.com/kokkos/kokkos/pull/6630), [\#6746](https://github.com/kokkos/kokkos/pull/6746) + +#### HIP: +- Fix Graph "multiple definition of" linking error (missing `inline` specifier) [\#6624](https://github.com/kokkos/kokkos/pull/6624) +- Add support for gfx940 (AMD Instinct MI300 GPU) [\#6671](https://github.com/kokkos/kokkos/pull/6671) + +### Build System +- CMake: Don't let Kokkos set `CMAKE_CXX_FLAGS` for Trilinos builds [\#6742](https://github.com/kokkos/kokkos/pull/6742) + +### Bug Fixes +- Remove deprecation warning for `AllocationMechanism` for GCC <11.0 [\#6653](https://github.com/kokkos/kokkos/pull/6653) +- Fix bug early tools finalize with non-default host execution instances [\#6635](https://github.com/kokkos/kokkos/pull/6635) +- Fix various issues for MSVC CUDA builds [\#6659](https://github.com/kokkos/kokkos/pull/6659) +- Fix "extra `;`" warning with `-pedantic` flag in `` [\#6510](https://github.com/kokkos/kokkos/pull/6510) + ## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00) @@ -43,7 +64,7 @@ #### SYCL: - Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246) -- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) +- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264) - Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321) - Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322) - Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189) @@ -64,7 +85,7 @@ - Add converting assignment to `DualView`: [\#6474](https://github.com/kokkos/kokkos/pull/6474) -### Build System Changes +### Build System Changes - Export `Kokkos_CXX_COMPILER_VERSION` [\#6282](https://github.com/kokkos/kokkos/pull/6282) - Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index f6bd81058e..4a4e7a5501 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -151,7 +151,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) set(Kokkos_VERSION_MINOR 2) -set(Kokkos_VERSION_PATCH 0) +set(Kokkos_VERSION_PATCH 1) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -252,7 +252,6 @@ ENDIF() # subpackages ## This restores the old behavior of ProjectCompilerPostConfig.cmake -# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos # We must do this before KOKKOS_PACKAGE_DECL IF (KOKKOS_HAS_TRILINOS) # Overwrite the old flags at the top-level @@ -280,21 +279,13 @@ IF (KOKKOS_HAS_TRILINOS) SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}") IF (KOKKOS_ENABLE_CUDA) STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) ENDFOREACH() - SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}") ENDIF() - # Both parent scope and this package - # In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in - # TRILINOS_TOPLEVEL_CXX_FLAGS - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE) - SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}") - #CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here #These flags get set up in KOKKOS_PACKAGE_DECL, which means they #must be configured before KOKKOS_PACKAGE_DECL SET(KOKKOS_ALL_COMPILE_OPTIONS diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index c970f72755..393422d73c 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -12,7 +12,7 @@ endif KOKKOS_VERSION_MAJOR = 4 KOKKOS_VERSION_MINOR = 2 -KOKKOS_VERSION_PATCH = 0 +KOKKOS_VERSION_PATCH = 1 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -23,7 +23,7 @@ KOKKOS_DEVICES ?= "OpenMP" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 -# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100 +# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC KOKKOS_ARCH ?= "" @@ -416,6 +416,8 @@ endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906)) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908)) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A)) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030)) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100)) @@ -1113,6 +1115,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") + KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 +endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index b962218b5f..3eb963faf2 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -198,8 +198,9 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) { // this is needed for intel to avoid // error #1011: missing return statement at end of non-void function -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) __builtin_unreachable(); #endif } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index b32a9be3a1..b5aa27c7c3 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -139,7 +139,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < sourceView.extent(0); ++i) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp index 1928f95588..21da333e75 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsPartitioned.cpp @@ -191,7 +191,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, // ----------------------------------------------- auto returnView_h = create_host_space_copy(returnView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) { auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL()); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp index c0bbdfa390..78ab6bf1f8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionCopy.cpp @@ -240,7 +240,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, "stdDestTrueView", numTeams, numCols); Kokkos::View stdDestFalseView( "stdDestFalseView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) { auto myRowSource = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp index 954d461246..370e91cc1f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamPartitionPoint.cpp @@ -197,7 +197,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, auto distancesView_h = create_host_space_copy(distancesView); auto dataViewAfterOp_h = create_host_space_copy(dataView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) { auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL()); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index 2082fa9728..ce18eb4d31 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) { auto rowFrom = Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp index 3315f281da..3dd7cb764c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveIf.cpp @@ -127,7 +127,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { // ----------------------------------------------- // check against std // ----------------------------------------------- - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); auto dataViewAfterOp_h = create_host_space_copy(dataView); auto distancesView_h = create_host_space_copy(distancesView); auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index ae43a2a426..d0217aed7a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -145,7 +145,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView); Kokkos::View stdDestView("stdDestView", numTeams, numCols); - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < sourceView.extent(0); ++i) { auto rowFrom = Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp index 1d5d9578f9..d79b53d355 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceIf.cpp @@ -103,7 +103,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j); } } - GreaterThanValueFunctor predicate(threshold); + GreaterThanValueFunctor predicate(threshold); for (std::size_t i = 0; i < dataView.extent(0); ++i) { auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL()); std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal); diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index bec59ebd03..9930d2abf0 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -114,6 +114,7 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX906 #cmakedefine KOKKOS_ARCH_AMD_GFX908 #cmakedefine KOKKOS_ARCH_AMD_GFX90A +#cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index bccf674d76..30764bde86 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -94,9 +94,9 @@ IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR K ENDIF() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942) +LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300) +LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940) +LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940) LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 8aae27d091..49d6c112e3 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -309,6 +309,11 @@ class ParallelReduce, if (CudaTraits::WarpSize < word_count.value) { __syncthreads(); + } else if (word_count.value > 1) { + // Inside cuda_single_inter_block_reduce_scan() above, shared[i] below + // might have been updated by a single thread within a warp without + // synchronization afterwards. Synchronize threads within warp to avoid + // potential racecondition. + __syncwarp(0xffffffff); } for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 498e57f94a..b4679b4e0d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -742,6 +742,11 @@ class ParallelReduce { hipGraphExec_t m_graph_exec = nullptr; }; -GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(m_graph); if (m_graph_exec) { @@ -92,12 +92,12 @@ GraphImpl::~GraphImpl() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph)); } -GraphImpl::GraphImpl(Kokkos::HIP instance) +inline GraphImpl::GraphImpl(Kokkos::HIP instance) : m_execution_space(std::move(instance)) { KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0)); } -void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node @@ -110,7 +110,7 @@ void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in it's policy template -void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -129,8 +129,8 @@ void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -void GraphImpl::add_predecessor(NodeImplPtr arg_node_ptr, - PredecessorRef arg_pred_ref) { +inline void GraphImpl::add_predecessor( + NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); KOKKOS_EXPECTS(pred_ptr); @@ -145,7 +145,7 @@ void GraphImpl::add_predecessor(NodeImplPtr arg_node_ptr, hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -void GraphImpl::submit() { +inline void GraphImpl::submit() { if (!m_graph_exec) { instantiate_graph(); } @@ -153,12 +153,12 @@ void GraphImpl::submit() { hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); } -Kokkos::HIP const& GraphImpl::get_execution_space() const +inline Kokkos::HIP const& GraphImpl::get_execution_space() const noexcept { return m_execution_space; } -auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(m_graph); KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), @@ -172,7 +172,7 @@ auto GraphImpl::create_root_node_ptr() { } template -auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) { +inline auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) { // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for // each predecessor ref, so all we need to do here is create the (trivial) diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index ef140ec46c..63ad66686b 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -30,7 +30,8 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ - defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ + defined(KOKKOS_ARCH_AMD_GFX942) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index 90d1404063..252aabd949 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -75,12 +75,19 @@ class HostSpace { /**\brief Non-default memory space instance to choose allocation mechansim, * if available */ - enum KOKKOS_DEPRECATED AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; +#if defined(KOKKOS_COMPILER_GNU) && KOKKOS_COMPILER_GNU < 1100 + // We see deprecation warnings even when not using the deprecated + // HostSpace constructor below when using gcc before release 11. + enum +#else + enum KOKKOS_DEPRECATED +#endif + AllocationMechanism { + STD_MALLOC, + POSIX_MEMALIGN, + POSIX_MMAP, + INTEL_MM_ALLOC + }; KOKKOS_DEPRECATED explicit HostSpace(const AllocationMechanism&); diff --git a/lib/kokkos/core/src/Kokkos_Printf.hpp b/lib/kokkos/core/src/Kokkos_Printf.hpp index af20221a5a..39f95825c3 100644 --- a/lib/kokkos/core/src/Kokkos_Printf.hpp +++ b/lib/kokkos/core/src/Kokkos_Printf.hpp @@ -31,7 +31,7 @@ namespace Kokkos { // backends. The GPU backends always return 1 and NVHPC only compiles if we // don't ask for the return value. template -KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) { +KOKKOS_FUNCTION void printf(const char* format, Args... args) { #ifdef KOKKOS_ENABLE_SYCL // Some compilers warn if "args" is empty and format is not a string literal if constexpr (sizeof...(Args) == 0) diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 44f0fbc180..12bf3b71f7 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -359,8 +359,6 @@ void OpenMPInternal::finalize() { } m_initialized = false; - - Kokkos::Profiling::finalize(); } void OpenMPInternal::print_configuration(std::ostream &s) const { diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 4586406e16..03f5fff395 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -219,6 +219,8 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions, Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions, partition_size); + OpenMP::memory_space space; + #pragma omp parallel num_threads(num_partitions) { Exec thread_local_instance(partition_size); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp index e81e834939..071ecdbc4f 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp @@ -58,8 +58,6 @@ void SerialInternal::finalize() { m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0); } - Kokkos::Profiling::finalize(); - m_is_initialized = false; } diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp index db1567610b..67119cac16 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -30,6 +30,7 @@ static_assert(false, #include #include +#include #include #include #include diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp index c754091e87..801a1ac82e 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -815,8 +815,6 @@ void ThreadsExec::finalize() { s_threads_process.m_pool_size = 1; s_threads_process.m_pool_fan_size = 0; s_threads_process.m_pool_state = ThreadsExec::Inactive; - - Kokkos::Profiling::finalize(); } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp index d32ef4ca23..424ba05a90 100644 --- a/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp +++ b/lib/kokkos/core/unit_test/TestMathematicalFunctions.hpp @@ -30,8 +30,9 @@ #define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS #endif -#if defined KOKKOS_COMPILER_INTEL || \ - (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130) +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable(); #else #define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE @@ -394,10 +395,12 @@ DEFINE_UNARY_FUNCTION_EVAL(log2, 2); DEFINE_UNARY_FUNCTION_EVAL(log1p, 2); #endif -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2); DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2); +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 DEFINE_UNARY_FUNCTION_EVAL(sin, 2); DEFINE_UNARY_FUNCTION_EVAL(cos, 2); DEFINE_UNARY_FUNCTION_EVAL(tan, 2); @@ -483,11 +486,9 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2); }; \ constexpr char math_function_name::name[] -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_BINARY_FUNCTION_EVAL(pow, 2); DEFINE_BINARY_FUNCTION_EVAL(hypot, 2); -#endif -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1); DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); #endif @@ -519,7 +520,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1); }; \ constexpr char math_function_name::name[] -#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 DEFINE_TERNARY_FUNCTION_EVAL(hypot, 2); DEFINE_TERNARY_FUNCTION_EVAL(fma, 2); #endif @@ -787,7 +788,9 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) { // TODO atan2 } +#endif +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 TEST(TEST_CATEGORY, mathematical_functions_power_functions) { TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11}); TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l}); @@ -1568,6 +1571,7 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) { // TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279 +#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2 template struct TestIsFinite { TestIsFinite() { run(); } @@ -1591,6 +1595,7 @@ struct TestIsFinite { ++e; Kokkos::printf("failed isfinite(float)\n"); } +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if (!isfinite(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isfinite(quiet_NaN::value) || @@ -1611,6 +1616,7 @@ struct TestIsFinite { ++e; Kokkos::printf("failed isfinite(KE::bhalf_t)\n"); } +#endif if (!isfinite(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isfinite(quiet_NaN::value) || @@ -1670,6 +1676,7 @@ struct TestIsInf { ++e; Kokkos::printf("failed isinf(float)\n"); } +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if (isinf(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isinf(quiet_NaN::value) || @@ -1690,6 +1697,7 @@ struct TestIsInf { ++e; Kokkos::printf("failed isinf(KE::bhalf_t)\n"); } +#endif if (isinf(3.) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || isinf(quiet_NaN::value) || @@ -1748,6 +1756,7 @@ struct TestIsNaN { ++e; Kokkos::printf("failed isnan(float)\n"); } +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if (isnan(static_cast(2.f)) #ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7 || !isnan(quiet_NaN::value) || @@ -1777,6 +1786,7 @@ struct TestIsNaN { ++e; Kokkos::printf("failed isnan(double)\n"); } +#endif #ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS if (isnan(4.l) || !isnan(quiet_NaN::value) || !isnan(signaling_NaN::value) || @@ -1803,6 +1813,7 @@ struct TestIsNaN { TEST(TEST_CATEGORY, mathematical_functions_isnan) { TestIsNaN(); } +#endif // TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279 #endif diff --git a/lib/kokkos/core/unit_test/TestNumericTraits.hpp b/lib/kokkos/core/unit_test/TestNumericTraits.hpp index 2b5531f29a..ec1c1e0ca0 100644 --- a/lib/kokkos/core/unit_test/TestNumericTraits.hpp +++ b/lib/kokkos/core/unit_test/TestNumericTraits.hpp @@ -110,8 +110,8 @@ struct TestNumericTraits { KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const { using Kokkos::Experimental::epsilon; - auto const eps = epsilon::value; - auto const one = T(1); + T const eps = epsilon::value; + T const one = 1; // Avoid higher precision intermediate representation compare() = one + eps; e += (int)!(compare() != one); diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index 1b216d9fe3..301a1fceb5 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -160,6 +160,7 @@ display_help_text() { echo " AMD_GFX906 = AMD GPU MI50/MI60 GFX906" echo " AMD_GFX908 = AMD GPU MI100 GFX908" echo " AMD_GFX90A = AMD GPU MI200 GFX90A" + echo " AMD_GFX940 = AMD GPU MI300 GFX940" echo " AMD_GFX942 = AMD GPU MI300 GFX942" echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030" echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100" diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt index fd0020b8d5..a43b5276a8 100644 --- a/lib/kokkos/master_history.txt +++ b/lib/kokkos/master_history.txt @@ -34,3 +34,4 @@ tag: 4.0.00 date: 02:23:2023 master: 5ad60966 release: 52ea2953 tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88 +tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a diff --git a/lib/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp b/lib/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp index af7cb1e2c6..7443f5596b 100644 --- a/lib/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp +++ b/lib/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp @@ -224,7 +224,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Kokkos::floor(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil( @@ -232,7 +232,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Kokkos::ceil(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round( @@ -240,7 +240,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Experimental::round_half_to_nearest_even(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc( @@ -248,7 +248,7 @@ template using data_type = std::conditional_t, T, double>; return Experimental::simd( Kokkos::trunc(static_cast(a[0]))); -}; +} template [[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION diff --git a/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp b/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp index 4feff3a89d..4af08c266b 100644 --- a/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp +++ b/lib/kokkos/simd/unit_tests/include/TestSIMD_GeneratorCtors.hpp @@ -42,6 +42,7 @@ inline void host_check_gen_ctor() { simd_type blend; blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag()); +#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) if constexpr (std::is_same_v) { simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; }); host_check_equality(basic, rhs, lanes); @@ -63,6 +64,7 @@ inline void host_check_gen_ctor() { host_check_equality(blend, result, lanes); } +#endif } template