diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 4b4a7ae6f2..234c1f140d 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -37,8 +37,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.4.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "c2fdcedb6953e6160c765366f6045abb" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.4.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "4c84698917c93a18985b311bb6caf84f" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) ExternalProject_Add(kokkos_build @@ -58,7 +58,7 @@ if(DOWNLOAD_KOKKOS) target_link_libraries(lmp PRIVATE LAMMPS::KOKKOS) add_dependencies(LAMMPS::KOKKOS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 3.4.00 REQUIRED CONFIG) + find_package(Kokkos 3.4.01 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) target_link_libraries(lmp PRIVATE Kokkos::kokkos) else() diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 8be4b2309f..423897fd7a 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -457,6 +457,9 @@ They must be specified in uppercase. * - ZEN2 - HOST - AMD Zen2 class CPU (AVX 2) + * - ZEN3 + - HOST + - AMD Zen3 class CPU (AVX 2) * - ARMV80 - HOST - ARMv8.0 Compatible CPU @@ -560,7 +563,7 @@ They must be specified in uppercase. - GPU - Intel GPUs Gen9+ -This list was last updated for version 3.4 of the Kokkos library. +This list was last updated for version 3.4.1 of the Kokkos library. .. tabs:: diff --git a/lib/kokkos/BUILD.md b/lib/kokkos/BUILD.md index e1f0e3e472..bb1a31f266 100644 --- a/lib/kokkos/BUILD.md +++ b/lib/kokkos/BUILD.md @@ -262,6 +262,9 @@ Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_ * Kokkos_ARCH_ZEN2 * Whether to optimize for the Zen2 architecture * BOOL Default: OFF +* Kokkos_ARCH_ZEN3 + * Whether to optimize for the Zen3 architecture + * BOOL Default: OFF * Kokkos_ARCH_HSW * Whether to optimize for the HSW architecture * BOOL Default: OFF diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 3ce38c37d8..7bb6de4cd9 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,20 @@ # Change Log +## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01) + +**Bug Fixes:** +- Windows: Remove atomic_compare_exchange_strong overload conflicts with Windows [\#4024](https://github.com/kokkos/kokkos/pull/4024) +- OpenMPTarget: Fixup allocation headers with OpenMPTarget backend [\#4020](https://github.com/kokkos/kokkos/pull/4020) +- OpenMPTarget: Add missing specailization for OMPT to Kokkos Random [\#4022](https://github.com/kokkos/kokkos/pull/4022) +- AMD: Add support for AMD Zen3 CPU architecture [\#4021](https://github.com/kokkos/kokkos/pull/4021) +- SYCL: Implement SYCL::print_configuration [\#4012](https://github.com/kokkos/kokkos/pull/4012) +- Containers: staticcsrgraph: use device type instead of execution space to construct views [\#3998](https://github.com/kokkos/kokkos/pull/3998) +- nvcc_wrapper: fix errors in argument handling, suppress duplicates of GPU architecture and RDC flags [\#4006](https://github.com/kokkos/kokkos/pull/4006) +- CI: Add icpx testing to intel container [\#4004](https://github.com/kokkos/kokkos/pull/4004) +- CMake/TRIBITS: Keep quoted compiler flags when passing to Trilinos [\#4007](https://github.com/kokkos/kokkos/pull/4007) +- CMake: Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945) + ## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 6fc1bf7d2f..9452027d8e 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -112,7 +112,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 00) +set(Kokkos_VERSION_PATCH 01) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -206,8 +206,13 @@ ENDIF() IF (KOKKOS_HAS_TRILINOS) # Overwrite the old flags at the top-level # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS}") + # we have to match the annoying behavior, also we have to preserve quotes + # which needs another workaround. + SET(KOKKOS_COMPILE_OPTIONS_TMP) + FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) + LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP \"${OPTION}\") + ENDFOREACH() + STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) IF (KOKKOS_ENABLE_CUDA) LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index aa97f99b75..013d2b3ede 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -12,7 +12,7 @@ endif KOKKOS_VERSION_MAJOR = 3 KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 00 +KOKKOS_VERSION_PATCH = 01 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,OpenMP,Pthread,Serial @@ -24,7 +24,7 @@ KOKKOS_DEVICES ?= "OpenMP" # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Vega900,Vega906,Vega908 -# AMD-CPUS: AMDAVX,Zen,Zen2 +# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 KOKKOS_ARCH ?= "" # Options: yes,no KOKKOS_DEBUG ?= "no" @@ -382,6 +382,7 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ # AMD based. KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) +KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3) KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2) KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen) KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900) @@ -391,12 +392,12 @@ KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega # Any AVX? KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Decide what ISA level we are able to support. -KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2)) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3)) KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC)) KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9)) KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7)) @@ -790,6 +791,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN3") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2") + + ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) + KOKKOS_CXXFLAGS += -mavx2 + KOKKOS_LDFLAGS += -mavx2 + else + KOKKOS_CXXFLAGS += -march=znver3 -mtune=znver3 + KOKKOS_LDFLAGS += -march=znver3 -mtune=znver3 + endif +endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80") tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX") @@ -1181,7 +1195,6 @@ endif ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h) -endif # Functions for generating config header file kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1) @@ -1232,6 +1245,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif +endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index fd099054ba..4df76a1dbb 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -5,10 +5,12 @@ KOKKOS_SUBPACKAGE(Algorithms) IF (NOT Kokkos_INSTALL_TESTING) ADD_SUBDIRECTORY(src) ENDIF() - -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) +IF(NOT (KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR + KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))) + KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) +ENDIF() KOKKOS_SUBPACKAGE_POSTPROCESS() - diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 904cf5ccb9..55ce19971f 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -687,6 +687,24 @@ struct Random_UniqueIndex { }; #endif +#ifdef KOKKOS_ENABLE_OPENMPTARGET +template <> +struct Random_UniqueIndex { + using locks_view_type = View; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks) { + const int team_size = omp_get_num_threads(); + int i = omp_get_team_num() * team_size + omp_get_thread_num(); + const int lock_size = locks.extent_int(0); + + while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) { + i = (i + 1) % lock_size; + } + return i; + } +}; +#endif + } // namespace Impl template diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index 9109837985..50f8f0a332 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -44,7 +44,7 @@ IF(Kokkos_ENABLE_OPENMP) ) ENDIF() -foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL) +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Because there is always an exception to the rule if(Tag STREQUAL "Threads") set(DEVICE "PTHREAD") diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 1f14875096..c37e779c99 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -109,6 +109,16 @@ struct RandomProperties { } }; +// FIXME_OPENMPTARGET: Need this for OpenMPTarget because contra to the standard +// llvm requires the binary operator defined not just the += +KOKKOS_INLINE_FUNCTION +RandomProperties operator+(const RandomProperties& org, + const RandomProperties& add) { + RandomProperties val = org; + val += add; + return val; +} + template struct test_random_functor { using rnd_type = typename GeneratorPool::generator_type; diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index a3c362ec20..9c6308c843 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -370,7 +370,10 @@ template void test_sort(unsigned int N) { test_1D_sort(N); test_3D_sort(N); +// FIXME_OPENMPTARGET: OpenMPTarget doesn't support DynamicView yet. +#ifndef KOKKOS_ENABLE_OPENMPTARGET test_dynamic_view_sort(N); +#endif test_issue_1160_sort(); } } // namespace Impl diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 5556e888e3..4e52e4d09f 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -67,6 +67,11 @@ shared_versioned_libraries="" # Does the User set the architecture arch_set=0 +arch_flag="" + +# Does the user set RDC? +rdc_set=0 +rdc_flag="" # Does the user overwrite the host compiler ccbin_set=0 @@ -190,8 +195,34 @@ do host_only_args="$host_only_args $1 $2" shift ;; + # Handle nvcc args controlling whether to generated relocatable device code + --relocatable-device-code=*|-rdc=*) + if [ "$rdc_set" -eq 0 ]; then + rdc_set=1 + rdc_flag="$1" + cuda_args="$cuda_args $rdc_flag" + elif [ "$rdc_flag" != "$1" ]; then + echo "RDC is being set twice with different flags, which is not handled" + echo "$rdc_flag" + echo "$1" + exit 1 + fi + ;; + -rdc) + if [ "$rdc_set" -eq 0 ]; then + rdc_set=1 + rdc_flag="$1 $2" + cuda_args="$cuda_args $rdc_flag" + shift + elif [ "$rdc_flag" != "$1 $2" ]; then + echo "RDC is being set twice with different flags, which is not handled" + echo "$rdc_flag" + echo "$1 $2" + exit 1 + fi + ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|--verbose|--keep|--keep-dir*|-G|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad=*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args @@ -199,13 +230,13 @@ do cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument - -rdc|-maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) + -maxrregcount=*|--maxrregcount=*) + cuda_args="$cuda_args $1" + ;; + -maxrregcount|--default-stream|-Xnvlink|--fmad|-cudart|--cudart|-include) cuda_args="$cuda_args $1 $2" shift ;; - -rdc=*|-maxrregcount*|--maxrregcount*) - cuda_args="$cuda_args $1" - ;; #Handle unsupported standard flags --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a) fallback_std_flag="-std=c++14" @@ -323,20 +354,36 @@ do ;; #Handle -arch argument (if its not set use a default) this is the version with = sign - -arch*|-gencode*) - cuda_args="$cuda_args $1" - arch_set=1 + -arch=*|-gencode=*) + if [ "$arch_set" -eq 0 ]; then + arch_set=1 + arch_flag="$1" + cuda_args="$cuda_args $arch_flag" + elif [ "$arch_flag" != "$1" ]; then + echo "ARCH is being set twice with different flags, which is not handled" + echo "$arch_flag" + echo "$1" + exit 1 + fi + ;; + #Handle -arch argument (if its not set use a default) this is the version without = sign + -arch|-gencode) + if [ "$arch_set" -eq 0 ]; then + arch_set=1 + arch_flag="$1 $2" + cuda_args="$cuda_args $arch_flag" + shift + elif [ "$arch_flag" != "$1 $2" ]; then + echo "ARCH is being set twice with different flags, which is not handled" + echo "$arch_flag" + echo "$1 $2" + exit 1 + fi ;; #Handle -code argument (if its not set use a default) this is the version with = sign -code*) cuda_args="$cuda_args $1" ;; - #Handle -arch argument (if its not set use a default) this is the version without = sign - -arch|-gencode) - cuda_args="$cuda_args $1 $2" - arch_set=1 - shift - ;; #Handle -code argument (if its not set use a default) this is the version without = sign -code) cuda_args="$cuda_args $1 $2" diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index fbfae3711e..3455b0cb42 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -99,5 +99,6 @@ #cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 +#cmakedefine KOKKOS_ARCH_AMD_ZEN3 #cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index ec18e70a36..e8b85542c6 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -63,6 +63,7 @@ KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0") KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6") KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") +KOKKOS_ARCH_OPTION(ZEN3 HOST "AMD Zen3 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") @@ -215,6 +216,15 @@ IF (KOKKOS_ARCH_ZEN2) SET(KOKKOS_ARCH_AMD_AVX2 ON) ENDIF() +IF (KOKKOS_ARCH_ZEN3) + COMPILER_SPECIFIC_FLAGS( + Intel -mavx2 + DEFAULT -march=znver3 -mtune=znver3 + ) + SET(KOKKOS_ARCH_AMD_ZEN3 ON) + SET(KOKKOS_ARCH_AMD_AVX2 ON) +ENDIF() + IF (KOKKOS_ARCH_WSM) COMPILER_SPECIFIC_FLAGS( Intel -xSSE4.2 @@ -284,7 +294,7 @@ IF (KOKKOS_ARCH_SKX) ) ENDIF() -IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2) +IF (KOKKOS_ARCH_WSM OR KOKKOS_ARCH_SNB OR KOKKOS_ARCH_HSW OR KOKKOS_ARCH_BDW OR KOKKOS_ARCH_KNL OR KOKKOS_ARCH_SKX OR KOKKOS_ARCH_ZEN OR KOKKOS_ARCH_ZEN2 OR KOKKOS_ARCH_ZEN3) SET(KOKKOS_USE_ISA_X86_64 ON) ENDIF() @@ -457,7 +467,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) ENDIF() IF (KOKKOS_ARCH_INTEL_GEN) COMPILER_SPECIFIC_FLAGS( - IntelClang -fopenmp-targets=spir64 -D__STRICT_ANSI__ + IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ ) ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index 4434d6928f..23847263a9 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -101,7 +101,7 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang SET(KOKKOS_CLANG_IS_INTEL TRUE) - SET(KOKKOS_CXX_COMPILER_ID IntelClang CACHE STRING INTERNAL FORCE) + SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index 445dad47ce..d7f83ddbdf 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -61,7 +61,7 @@ IF(KOKKOS_ENABLE_OPENMP) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} - IntelClang -Xcompiler -fiopenmp + IntelLLVM -Xcompiler -fiopenmp PGI -Xcompiler -mp Cray NO-VALUE-SPECIFIED XL -Xcompiler -qsmp=omp @@ -70,7 +70,7 @@ IF(KOKKOS_ENABLE_OPENMP) ELSE() COMPILER_SPECIFIC_FLAGS( Clang ${ClangOpenMPFlag} - IntelClang -fiopenmp + IntelLLVM -fiopenmp AppleClang -Xpreprocessor -fopenmp PGI -mp Cray NO-VALUE-SPECIFIED @@ -92,7 +92,7 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) COMPILER_SPECIFIC_FLAGS( Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelClang -fiopenmp -Wno-openmp-mapping + IntelLLVM -fiopenmp -Wno-openmp-mapping XL -qsmp=omp -qoffload -qnoeh PGI -mp=gpu DEFAULT -fopenmp diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index 858322394d..e1a3e5f8bd 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -773,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" diff --git a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp index f22e5d1eca..00d3eafd23 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp @@ -114,15 +114,11 @@ namespace Kokkos { template inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( const std::string& label, const std::vector& input) { - using output_type = StaticCrsGraphType; - // using input_type = std::vector; // unused - + using output_type = StaticCrsGraphType; using entries_type = typename output_type::entries_type; - - using work_type = View; + using work_type = View< + typename output_type::size_type[], typename output_type::array_layout, + typename output_type::device_type, typename output_type::memory_traits>; output_type output; @@ -161,10 +157,9 @@ inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( static_assert(entries_type::rank == 1, "Graph entries view must be rank one"); - using work_type = View; + using work_type = View< + typename output_type::size_type[], typename output_type::array_layout, + typename output_type::device_type, typename output_type::memory_traits>; output_type output; diff --git a/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp b/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp index dc5e0194ab..58d723ac11 100644 --- a/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp @@ -179,8 +179,6 @@ class SharedAllocationRecord const RecordBase::function_type arg_dealloc = &deallocate); public: - std::string get_label() const; - KOKKOS_INLINE_FUNCTION static SharedAllocationRecord* allocate( const Kokkos::Experimental::OpenMPTargetSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size) { @@ -190,10 +188,6 @@ class SharedAllocationRecord return nullptr; #endif } - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); }; } // namespace Impl diff --git a/lib/kokkos/core/src/Kokkos_SYCL.hpp b/lib/kokkos/core/src/Kokkos_SYCL.hpp index aa720371df..8ee76b4386 100644 --- a/lib/kokkos/core/src/Kokkos_SYCL.hpp +++ b/lib/kokkos/core/src/Kokkos_SYCL.hpp @@ -113,7 +113,7 @@ class SYCL { void fence() const; /// \brief Print configuration information to the given output stream. - static void print_configuration(std::ostream&, const bool detail = false); + void print_configuration(std::ostream&, const bool detail = false); /// \brief Free any resources being consumed by the device. static void impl_finalize(); @@ -131,12 +131,10 @@ class SYCL { sycl::device get_device() const; friend std::ostream& operator<<(std::ostream& os, const SYCLDevice& that) { - return that.info(os); + return SYCL::impl_sycl_info(os, that.m_device); } private: - std::ostream& info(std::ostream& os) const; - sycl::device m_device; }; @@ -154,6 +152,9 @@ class SYCL { } private: + static std::ostream& impl_sycl_info(std::ostream& os, + const sycl::device& device); + Kokkos::Impl::HostSharedPtr m_space_instance; }; diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp index 6fbb4245b8..b99b0017ca 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp @@ -107,12 +107,6 @@ SharedAllocationRecord::m_alloc_size); } -// TODO: Implement deep copy back see CudaSpace -std::string SharedAllocationRecord::get_label() const { - return std::string("OpenMPTargetAllocation"); -} - SharedAllocationRecord:: SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace &arg_space, @@ -141,23 +135,6 @@ SharedAllocationRecord:: //---------------------------------------------------------------------------- -void *SharedAllocationRecord:: - reallocate_tracked(void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - // Kokkos::Impl::DeepCopy( r_new->data() - // , r_old->data() - // , std::min( r_old->size() , - // r_new->size() ) ); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 9c29eb190d..3a09ee9195 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -105,6 +105,12 @@ bool SYCL::impl_is_initialized() { void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } +void SYCL::print_configuration(std::ostream& s, const bool detailed) { + s << "macro KOKKOS_ENABLE_SYCL : defined" << '\n'; + if (detailed) + SYCL::impl_sycl_info(s, m_space_instance->m_queue->get_device()); +} + void SYCL::fence() const { Impl::SYCLInternal::fence(*m_space_instance->m_queue); } @@ -143,119 +149,118 @@ void SYCL::impl_initialize(SYCL::SYCLDevice d) { Impl::SYCLInternal::singleton().initialize(d.get_device()); } -std::ostream& SYCL::SYCLDevice::info(std::ostream& os) const { +std::ostream& SYCL::impl_sycl_info(std::ostream& os, + const sycl::device& device) { using namespace sycl::info; - return os << "Name: " << m_device.get_info() - << "\nDriver Version: " - << m_device.get_info() - << "\nIs Host: " << m_device.is_host() - << "\nIs CPU: " << m_device.is_cpu() - << "\nIs GPU: " << m_device.is_gpu() - << "\nIs Accelerator: " << m_device.is_accelerator() - << "\nVendor Id: " << m_device.get_info() + return os << "Name: " << device.get_info() + << "\nDriver Version: " << device.get_info() + << "\nIs Host: " << device.is_host() + << "\nIs CPU: " << device.is_cpu() + << "\nIs GPU: " << device.is_gpu() + << "\nIs Accelerator: " << device.is_accelerator() + << "\nVendor Id: " << device.get_info() << "\nMax Compute Units: " - << m_device.get_info() + << device.get_info() << "\nMax Work Item Dimensions: " - << m_device.get_info() + << device.get_info() << "\nMax Work Group Size: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Char: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Short: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Int: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Long: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Float: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Double: " - << m_device.get_info() + << device.get_info() << "\nPreferred Vector Width Half: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Char: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Short: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Int: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Long: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Float: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Double: " - << m_device.get_info() + << device.get_info() << "\nNative Vector Width Half: " - << m_device.get_info() - << "\nAddress Bits: " << m_device.get_info() - << "\nImage Support: " << m_device.get_info() + << device.get_info() + << "\nAddress Bits: " << device.get_info() + << "\nImage Support: " << device.get_info() << "\nMax Mem Alloc Size: " - << m_device.get_info() + << device.get_info() << "\nMax Read Image Args: " - << m_device.get_info() + << device.get_info() << "\nImage2d Max Width: " - << m_device.get_info() + << device.get_info() << "\nImage2d Max Height: " - << m_device.get_info() + << device.get_info() << "\nImage3d Max Width: " - << m_device.get_info() + << device.get_info() << "\nImage3d Max Height: " - << m_device.get_info() + << device.get_info() << "\nImage3d Max Depth: " - << m_device.get_info() + << device.get_info() << "\nImage Max Buffer Size: " - << m_device.get_info() + << device.get_info() << "\nImage Max Array Size: " - << m_device.get_info() - << "\nMax Samplers: " << m_device.get_info() + << device.get_info() + << "\nMax Samplers: " << device.get_info() << "\nMax Parameter Size: " - << m_device.get_info() + << device.get_info() << "\nMem Base Addr Align: " - << m_device.get_info() + << device.get_info() << "\nGlobal Cache Mem Line Size: " - << m_device.get_info() + << device.get_info() << "\nGlobal Mem Cache Size: " - << m_device.get_info() + << device.get_info() << "\nGlobal Mem Size: " - << m_device.get_info() + << device.get_info() << "\nMax Constant Buffer Size: " - << m_device.get_info() + << device.get_info() << "\nMax Constant Args: " - << m_device.get_info() - << "\nLocal Mem Size: " - << m_device.get_info() + << device.get_info() + << "\nLocal Mem Size: " << device.get_info() << "\nError Correction Support: " - << m_device.get_info() + << device.get_info() << "\nHost Unified Memory: " - << m_device.get_info() + << device.get_info() << "\nProfiling Timer Resolution: " - << m_device.get_info() + << device.get_info() << "\nIs Endian Little: " - << m_device.get_info() - << "\nIs Available: " << m_device.get_info() + << device.get_info() + << "\nIs Available: " << device.get_info() << "\nIs Compiler Available: " - << m_device.get_info() + << device.get_info() << "\nIs Linker Available: " - << m_device.get_info() + << device.get_info() << "\nQueue Profiling: " - << m_device.get_info() + << device.get_info() << "\nBuilt In Kernels: " << Container>( - m_device.get_info()) - << "\nVendor: " << m_device.get_info() - << "\nProfile: " << m_device.get_info() - << "\nVersion: " << m_device.get_info() + device.get_info()) + << "\nVendor: " << device.get_info() + << "\nProfile: " << device.get_info() + << "\nVersion: " << device.get_info() << "\nExtensions: " << Container>( - m_device.get_info()) + device.get_info()) << "\nPrintf Buffer Size: " - << m_device.get_info() + << device.get_info() << "\nPreferred Interop User Sync: " - << m_device.get_info() + << device.get_info() << "\nPartition Max Sub Devices: " - << m_device.get_info() + << device.get_info() << "\nReference Count: " - << m_device.get_info() << '\n'; + << device.get_info() << '\n'; } namespace Impl { @@ -293,15 +298,13 @@ void SYCLSpaceInitializer::fence() { } void SYCLSpaceInitializer::print_configuration(std::ostream& msg, - const bool /*detail*/) { + const bool detail) { msg << "Devices:" << std::endl; msg << " KOKKOS_ENABLE_SYCL: "; msg << "yes" << std::endl; msg << "\nRuntime Configuration:" << std::endl; - // FIXME_SYCL not implemented - std::abort(); - // Experimental::SYCL::print_configuration(msg, detail); + Experimental::SYCL{}.print_configuration(msg, detail); } } // namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp index 3f2e8914ea..2f824566b8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp @@ -152,12 +152,6 @@ inline T atomic_compare_exchange( ((LONGLONG*)&compare_and_result)); return compare_and_result; } - -template -inline T atomic_compare_exchange_strong(volatile T* const dest, - const T& compare, const T& val) { - return atomic_compare_exchange(dest, compare, val); -} #endif } // namespace Kokkos diff --git a/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash b/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash index 5ff781b96f..8fe8e2b5ec 100755 --- a/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash +++ b/lib/kokkos/core/unit_test/configuration/test-code/test_config_arch_list.bash @@ -4,7 +4,7 @@ HostArch=(SNB HSW SKX KNL) DeviceArch=(Kepler35 Kepler37 Pascal60 Pascal61 Volta70) if [ ! -z "$KOKKOS_HOST_ARCH_TEST" ]; then export KOKKOS_ARCH_TEST=1 - HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 ARMv8_ThunderX ARMv8_ThunderX2) + HostArch=(WSM SNB HSW SKX WSM AMDAVX ARMv80 ARMv81 BDW KNC KNL BGQ Power7 Power8 Power9 Zen Zen2 Zen3 ARMv8_ThunderX ARMv8_ThunderX2) DeviceArch=() fi diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index e9871b4369..c601e0ee16 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -157,6 +157,7 @@ display_help_text() { echo " AMDAVX = AMD CPU" echo " ZEN = AMD Zen-Core CPU" echo " ZEN2 = AMD Zen2-Core CPU" + echo " ZEN3 = AMD Zen3-Core CPU" echo " [AMD: GPU]" echo " VEGA900 = AMD GPU MI25 GFX900" echo " VEGA906 = AMD GPU MI50/MI60 GFX906" diff --git a/lib/kokkos/gnu_generate_makefile.bash b/lib/kokkos/gnu_generate_makefile.bash index ea509669f0..8a463270c8 100755 --- a/lib/kokkos/gnu_generate_makefile.bash +++ b/lib/kokkos/gnu_generate_makefile.bash @@ -137,6 +137,7 @@ do echo " AMDAVX = AMD CPU" echo " ZEN = AMD Zen-Core CPU" echo " ZEN2 = AMD Zen2-Core CPU" + echo " ZEN3 = AMD Zen3-Core CPU" echo " [ARM]" echo " ARMv80 = ARMv8.0 Compatible CPU" echo " ARMv81 = ARMv8.1 Compatible CPU" diff --git a/lib/kokkos/master_history.txt b/lib/kokkos/master_history.txt index 7a58f593d0..be8a5e7da5 100644 --- a/lib/kokkos/master_history.txt +++ b/lib/kokkos/master_history.txt @@ -24,3 +24,4 @@ tag: 3.2.00 date: 08:19:2020 master: 3b2fdc7e release: 5dc6d303 tag: 3.3.00 date: 12:16:2020 master: 734f577a release: 1535ba5c tag: 3.3.01 date: 01:06:2021 master: 6d65b5a3 release: 4d23839c tag: 3.4.00 date: 04:26:2021 master: 1fb0c284 release: 5d7738d6 +tag: 3.4.01 date: 05:20:2021 master: 4b97a22f release: 410b15c8