From c27ca946603a62eb293ffe5dcbfcce6b278cc78b Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Tue, 24 Dec 2024 09:58:51 -0700 Subject: [PATCH] Update Kokkos library in LAMMPS to v4.5.1 --- lib/kokkos/CHANGELOG.md | 11 ++ lib/kokkos/CMakeLists.txt | 2 +- lib/kokkos/Makefile.kokkos | 13 +- lib/kokkos/README.md | 6 +- lib/kokkos/cmake/build_env_info.cmake | 4 +- .../containers/src/Kokkos_DynRankView.hpp | 127 +++++++++++++++++- lib/kokkos/core/unit_test/TestAtomicViews.hpp | 8 +- .../core/unit_test/TestViewBadAlloc.hpp | 6 + lib/kokkos/master_history.txt | 1 + .../experimental/__p0009_bits/config.hpp | 8 +- .../__p2630_bits/submdspan_mapping.hpp | 17 ++- 11 files changed, 177 insertions(+), 26 deletions(-) diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 6c237ebca8..84bbd03585 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,16 @@ # CHANGELOG +## 4.5.01 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.5.00...4.5.01) + +### Bug Fixes + +* Fix re-builds after cleaning the binary tree when doing `add_subdirectory` on the Kokkos source [\#7557](https://github.com/kokkos/kokkos/pull/7557) +* Update mdspan to include fix for submdspan and bracket operator with clang 15&16 [\#7559](https://github.com/kokkos/kokkos/pull/7559) +* Fix DynRankView performance regression by re-introducing shortcut operator() impls [\#7606](https://github.com/kokkos/kokkos/pull/7606) +* Add missing MI300A (`GFX942_APU`) option to Makefile build-system + ## 4.5.00 [Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.01...4.5.00) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index f0bf8e3634..6a70bea149 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -149,7 +149,7 @@ endif() set(Kokkos_VERSION_MAJOR 4) set(Kokkos_VERSION_MINOR 5) -set(Kokkos_VERSION_PATCH 0) +set(Kokkos_VERSION_PATCH 1) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") 
message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 75dcbb9536..abdfb7a316 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -1,6 +1,6 @@ # Default settings common options. -#LAMMPS specific settings: +#LAMMPS specific settings: ifndef KOKKOS_PATH KOKKOS_PATH=../../lib/kokkos endif @@ -12,7 +12,7 @@ endif KOKKOS_VERSION_MAJOR = 4 KOKKOS_VERSION_MINOR = 5 -KOKKOS_VERSION_PATCH = 0 +KOKKOS_VERSION_PATCH = 1 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial @@ -23,7 +23,7 @@ KOKKOS_DEVICES ?= "OpenMP" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace # IBM: Power8,Power9 -# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX942_APU,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" @@ -464,6 +464,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942_APU) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) 
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) @@ -478,6 +479,7 @@ KOKKOS_INTERNAL_USE_ARCH_AMD := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX9 + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100) \ + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103)) @@ -1206,6 +1208,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942_APU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 +endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1030\"") diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index 0ea07f9ea2..56159b35c2 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -30,12 +30,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.5.00](https://github.com/kokkos/kokkos/releases/tag/4.5.00). +The current release is [4.5.01](https://github.com/kokkos/kokkos/releases/tag/4.5.01). 
```bash -curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.01/kokkos-4.5.01.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/releases/download/4.5.00/kokkos-4.5.00.tar.gz +wget https://github.com/kokkos/kokkos/releases/download/4.5.01/kokkos-4.5.01.tar.gz ``` To clone the latest development version of Kokkos from GitHub: diff --git a/lib/kokkos/cmake/build_env_info.cmake b/lib/kokkos/cmake/build_env_info.cmake index ac28b2d850..76afbb74b6 100644 --- a/lib/kokkos/cmake/build_env_info.cmake +++ b/lib/kokkos/cmake/build_env_info.cmake @@ -4,7 +4,7 @@ find_package(Git QUIET) set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -set(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(post_configure_dir ${CMAKE_CURRENT_BINARY_DIR}/generated) set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) @@ -105,7 +105,7 @@ function(check_git_setup) ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} ) - add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) + add_library(impl_git_version ${CMAKE_CURRENT_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals) add_dependencies(impl_git_version AlwaysCheckGit) diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 2f2f4433e7..b860359526 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -524,7 +524,10 @@ class DynRankView : private View { std::is_same_v, is_default_map = std::is_void_v && - (is_layout_left || is_layout_right || is_layout_stride) + 
(is_layout_left || is_layout_right || is_layout_stride), + + is_default_access = + is_default_map && std::is_same_v }; // Bounds checking macros @@ -574,12 +577,134 @@ class DynRankView : private View { using view_type::stride_7; // FIXME: not tested using view_type::use_count; +#ifdef KOKKOS_ENABLE_CUDA KOKKOS_FUNCTION reference_type operator()(index_type i0 = 0, index_type i1 = 0, index_type i2 = 0, index_type i3 = 0, index_type i4 = 0, index_type i5 = 0, index_type i6 = 0) const { return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } +#else + // Adding shortcut operators for rank-0 to rank-3 for default layouts + // and access modalities. + // This removes performance overhead for always using rank-7 mapping. + // See https://github.com/kokkos/kokkos/issues/7604 + // When boundschecking is enabled we still go through the underlying + // rank-7 View to leverage the error checks there. + + KOKKOS_FUNCTION reference_type operator()() const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 0u) + Kokkos::abort( + "DynRankView rank 0 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + return view_type::data()[0]; + } else +#endif + return view_type::operator()(0, 0, 0, 0, 0, 0, 0); + } + + KOKKOS_FUNCTION reference_type operator()(index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) 
allows mismatch of rank and + // index args + if (rank() > 1u) + Kokkos::abort( + "DynRankView rank 1 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_stride) { + return view_type::data()[i0 * view_type::stride(0)]; + } else { + return view_type::data()[i0]; + } + } else +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif + } + + KOKKOS_FUNCTION reference_type operator()(index_type i0, + index_type i1) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) allows mismatch of rank and + // index args + if (rank() > 2u) + Kokkos::abort( + "DynRankView rank 2 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_left) { + return view_type::data()[i0 + i1 * view_type::stride(1)]; + } else if constexpr (is_layout_right) { + return view_type::data()[i0 * view_type::extent(1) + i1]; + } else { + return view_type::data()[i0 * view_type::stride(0) + + i1 * view_type::stride(1)]; + } + } else +#endif + return view_type::operator()(i0, i1, 0, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif + } + + KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, + index_type i2) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) 
allows mismatch of rank and + // index args + if (rank() > 3u) + Kokkos::abort( + "DynRankView rank 3 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_left) { + return view_type::data()[i0 + view_type::stride(1) * + (i1 + i2 * view_type::extent(1))]; + } else if constexpr (is_layout_right) { + return view_type::data()[(i0 * view_type::extent(1) + i1) * + view_type::extent(2) + + i2]; + } else { + return view_type::data()[i0 * view_type::stride(0) + + i1 * view_type::stride(1) + + i2 * view_type::stride(2)]; + } + } else +#endif + return view_type::operator()(i0, i1, i2, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif + } + + KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, + index_type i2, index_type i3, + index_type i4 = 0, + index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); + } +#endif // This is an accomodation for Phalanx, that is usint the operator[] to access // all elements in a linear fashion even when the rank is not 1 diff --git a/lib/kokkos/core/unit_test/TestAtomicViews.hpp b/lib/kokkos/core/unit_test/TestAtomicViews.hpp index fa72e0b0cf..55ff62822b 100644 --- a/lib/kokkos/core/unit_test/TestAtomicViews.hpp +++ b/lib/kokkos/core/unit_test/TestAtomicViews.hpp @@ -1065,13 +1065,9 @@ T AndEqualAtomicViewCheck(const int64_t input_length) { const int64_t N = input_length; T result[2] = {1}; for (int64_t i = 0; i < N; ++i) { - if (N % 2 == 0) { - result[0] &= (T)i; - } else { - result[1] &= (T)i; - } + int64_t idx = N % 2; + result[idx] &= (T)i; } - return (result[0]); } diff --git a/lib/kokkos/core/unit_test/TestViewBadAlloc.hpp b/lib/kokkos/core/unit_test/TestViewBadAlloc.hpp index c876ceb787..1707a9d5d2 100644 --- 
a/lib/kokkos/core/unit_test/TestViewBadAlloc.hpp +++ b/lib/kokkos/core/unit_test/TestViewBadAlloc.hpp @@ -66,6 +66,12 @@ TEST(TEST_CATEGORY, view_bad_alloc) { } #endif +#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) + if (std::is_same_v) { + GTEST_SKIP() << "MSVC/CUDA segfaults when allocating too much memory"; + } +#endif + test_view_bad_alloc(); constexpr bool execution_space_is_device = diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt index 3f9e4c6e15..c9e454c1af 100644 --- a/lib/kokkos/master_history.txt +++ b/lib/kokkos/master_history.txt @@ -40,3 +40,4 @@ tag: 4.3.01 date: 05:07:2024 master: 486cc745 release: 262d2d6e tag: 4.4.00 date: 08:08:2024 master: 6ecdf605 release: 6068673c tag: 4.4.01 date: 09:12:2024 master: 08ceff92 release: 2d60c039 tag: 4.5.00 date: 11:11:2024 master: 15dc143e release: 5164f2f6 +tag: 4.5.01 date: 12:19:2024 master: 09e775bf release: e0d656f9 diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp index 24166462e7..e8cacf40d6 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p0009_bits/config.hpp @@ -240,7 +240,13 @@ static_assert(_MDSPAN_CPLUSPLUS >= MDSPAN_CXX_STD_14, "mdspan requires C++14 or #ifndef MDSPAN_USE_BRACKET_OPERATOR # if defined(__cpp_multidimensional_subscript) -# define MDSPAN_USE_BRACKET_OPERATOR 1 +// The following if/else is necessary to workaround a clang issue +// relative to using a parameter pack inside a bracket operator in C++2b/C++23 mode +# if defined(_MDSPAN_COMPILER_CLANG) && ((__clang_major__ == 15) || (__clang_major__ == 16)) +# define MDSPAN_USE_BRACKET_OPERATOR 0 +# else +# define MDSPAN_USE_BRACKET_OPERATOR 1 +# endif # else # define MDSPAN_USE_BRACKET_OPERATOR 0 # endif diff --git a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp 
b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp index 2a2cdf76b9..46ccbaadeb 100644 --- a/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp +++ b/lib/kokkos/tpls/mdspan/include/experimental/__p2630_bits/submdspan_mapping.hpp @@ -252,7 +252,7 @@ layout_left::mapping::submdspan_mapping_impl( *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have -// the issue But Clang-CUDA also doesn't accept the use of deduction guide so +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so // disable it for CUDA altogether #if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) detail::tuple{ @@ -330,7 +330,7 @@ MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_left_padded::mapping{ @@ -485,7 +485,7 @@ layout_right::mapping::submdspan_mapping_impl( *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have -// the issue But Clang-CUDA also doesn't accept the use of deduction guide so +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so // disable it for CUDA altogether #if defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple{ @@ -555,7 +555,7 @@ MDSPAN_IMPL_PROPOSED_NAMESPACE::layout_right_padded::mapping{ @@ -603,12 +603,11 @@ layout_stride::mapping::submdspan_mapping_impl( *this, inv_map, // HIP needs deduction guides to have markups so we need to be explicit // NVCC 11.0 has a bug with deduction guide here, tested that 11.2 does not have -// the issue -#if defined(_MDSPAN_HAS_HIP) || \ - (defined(__NVCC__) && \ - (__CUDACC_VER_MAJOR__ * 100 + __CUDACC_VER_MINOR__ * 10) < 1120) +// the issue but Clang-CUDA also doesn't accept the use of deduction guide so +// disable it for CUDA altogether +#if 
defined(_MDSPAN_HAS_HIP) || defined(_MDSPAN_HAS_CUDA) MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple( - detail::stride_of(slices)...).values)), + detail::stride_of(slices)...)).values), #else MDSPAN_IMPL_STANDARD_NAMESPACE::detail::tuple(detail::stride_of(slices)...)).values), #endif