Update Kokkos library in LAMMPS to v4.2.1
This commit is contained in:
@ -1,5 +1,26 @@
|
|||||||
# CHANGELOG
|
# CHANGELOG
|
||||||
|
|
||||||
|
## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07)
|
||||||
|
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01)
|
||||||
|
|
||||||
|
### Backend and Architecture Enhancements:
|
||||||
|
|
||||||
|
#### CUDA:
|
||||||
|
- Add warp sync for `parallel_reduce` to avoid race condition [\#6630](https://github.com/kokkos/kokkos/pull/6630), [\#6746](https://github.com/kokkos/kokkos/pull/6746)
|
||||||
|
|
||||||
|
#### HIP:
|
||||||
|
- Fix Graph "multiple definition of" linking error (missing `inline` specifier) [\#6624](https://github.com/kokkos/kokkos/pull/6624)
|
||||||
|
- Add support for gfx940 (AMD Instinct MI300 GPU) [\#6671](https://github.com/kokkos/kokkos/pull/6671)
|
||||||
|
|
||||||
|
### Build System
|
||||||
|
- CMake: Don't let Kokkos set `CMAKE_CXX_FLAGS` for Trilinos builds [\#6742](https://github.com/kokkos/kokkos/pull/6742)
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
- Remove deprecation warning for `AllocationMechanism` for GCC <11.0 [\#6653](https://github.com/kokkos/kokkos/pull/6653)
|
||||||
|
- Fix bug early tools finalize with non-default host execution instances [\#6635](https://github.com/kokkos/kokkos/pull/6635)
|
||||||
|
- Fix various issues for MSVC CUDA builds [\#6659](https://github.com/kokkos/kokkos/pull/6659)
|
||||||
|
- Fix "extra `;`" warning with `-pedantic` flag in `<Kokkos_SIMD_Scalar.hpp>` [\#6510](https://github.com/kokkos/kokkos/pull/6510)
|
||||||
|
|
||||||
## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06)
|
## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06)
|
||||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00)
|
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00)
|
||||||
|
|
||||||
@ -43,7 +64,7 @@
|
|||||||
|
|
||||||
#### SYCL:
|
#### SYCL:
|
||||||
- Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246)
|
- Enforce external `sycl::queues` to be in-order [\#6246](https://github.com/kokkos/kokkos/pull/6246)
|
||||||
- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264)
|
- Improve reduction performance: [\#6272](https://github.com/kokkos/kokkos/pull/6272) [\#6271](https://github.com/kokkos/kokkos/pull/6271) [\#6270](https://github.com/kokkos/kokkos/pull/6270) [\#6264](https://github.com/kokkos/kokkos/pull/6264)
|
||||||
- Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321)
|
- Allow using the SYCL execution space on AMD GPUs [\#6321](https://github.com/kokkos/kokkos/pull/6321)
|
||||||
- Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322)
|
- Allow sorting via native oneDPL to support Views with stride=1 [\#6322](https://github.com/kokkos/kokkos/pull/6322)
|
||||||
- Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189)
|
- Make in-order queues the default via macro [\#6189](https://github.com/kokkos/kokkos/pull/6189)
|
||||||
@ -64,7 +85,7 @@
|
|||||||
- Add converting assignment to `DualView`: [\#6474](https://github.com/kokkos/kokkos/pull/6474)
|
- Add converting assignment to `DualView`: [\#6474](https://github.com/kokkos/kokkos/pull/6474)
|
||||||
|
|
||||||
|
|
||||||
### Build System Changes
|
### Build System Changes
|
||||||
|
|
||||||
- Export `Kokkos_CXX_COMPILER_VERSION` [\#6282](https://github.com/kokkos/kokkos/pull/6282)
|
- Export `Kokkos_CXX_COMPILER_VERSION` [\#6282](https://github.com/kokkos/kokkos/pull/6282)
|
||||||
- Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342)
|
- Disable default oneDPL support in Trilinos [\#6342](https://github.com/kokkos/kokkos/pull/6342)
|
||||||
|
|||||||
@ -151,7 +151,7 @@ ENDIF()
|
|||||||
|
|
||||||
set(Kokkos_VERSION_MAJOR 4)
|
set(Kokkos_VERSION_MAJOR 4)
|
||||||
set(Kokkos_VERSION_MINOR 2)
|
set(Kokkos_VERSION_MINOR 2)
|
||||||
set(Kokkos_VERSION_PATCH 0)
|
set(Kokkos_VERSION_PATCH 1)
|
||||||
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
|
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
|
||||||
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
|
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
|
||||||
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
|
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
|
||||||
@ -252,7 +252,6 @@ ENDIF()
|
|||||||
# subpackages
|
# subpackages
|
||||||
|
|
||||||
## This restores the old behavior of ProjectCompilerPostConfig.cmake
|
## This restores the old behavior of ProjectCompilerPostConfig.cmake
|
||||||
# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos
|
|
||||||
# We must do this before KOKKOS_PACKAGE_DECL
|
# We must do this before KOKKOS_PACKAGE_DECL
|
||||||
IF (KOKKOS_HAS_TRILINOS)
|
IF (KOKKOS_HAS_TRILINOS)
|
||||||
# Overwrite the old flags at the top-level
|
# Overwrite the old flags at the top-level
|
||||||
@ -280,21 +279,13 @@ IF (KOKKOS_HAS_TRILINOS)
|
|||||||
SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
|
SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
|
||||||
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
|
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
|
||||||
ENDFOREACH()
|
ENDFOREACH()
|
||||||
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
|
|
||||||
IF (KOKKOS_ENABLE_CUDA)
|
IF (KOKKOS_ENABLE_CUDA)
|
||||||
STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}")
|
STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}")
|
||||||
FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
|
FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
|
||||||
SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}")
|
SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}")
|
||||||
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG})
|
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG})
|
||||||
ENDFOREACH()
|
ENDFOREACH()
|
||||||
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}")
|
|
||||||
ENDIF()
|
ENDIF()
|
||||||
# Both parent scope and this package
|
|
||||||
# In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in
|
|
||||||
# TRILINOS_TOPLEVEL_CXX_FLAGS
|
|
||||||
SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE)
|
|
||||||
SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}")
|
|
||||||
#CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here
|
|
||||||
#These flags get set up in KOKKOS_PACKAGE_DECL, which means they
|
#These flags get set up in KOKKOS_PACKAGE_DECL, which means they
|
||||||
#must be configured before KOKKOS_PACKAGE_DECL
|
#must be configured before KOKKOS_PACKAGE_DECL
|
||||||
SET(KOKKOS_ALL_COMPILE_OPTIONS
|
SET(KOKKOS_ALL_COMPILE_OPTIONS
|
||||||
|
|||||||
@ -12,7 +12,7 @@ endif
|
|||||||
|
|
||||||
KOKKOS_VERSION_MAJOR = 4
|
KOKKOS_VERSION_MAJOR = 4
|
||||||
KOKKOS_VERSION_MINOR = 2
|
KOKKOS_VERSION_MINOR = 2
|
||||||
KOKKOS_VERSION_PATCH = 0
|
KOKKOS_VERSION_PATCH = 1
|
||||||
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
|
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
|
||||||
|
|
||||||
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
|
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
|
||||||
@ -23,7 +23,7 @@ KOKKOS_DEVICES ?= "OpenMP"
|
|||||||
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
|
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
|
||||||
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
|
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
|
||||||
# IBM: BGQ,Power7,Power8,Power9
|
# IBM: BGQ,Power7,Power8,Power9
|
||||||
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100
|
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100
|
||||||
# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
|
# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
|
||||||
# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
|
# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
|
||||||
KOKKOS_ARCH ?= ""
|
KOKKOS_ARCH ?= ""
|
||||||
@ -416,6 +416,8 @@ endif
|
|||||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906))
|
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906))
|
||||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908))
|
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908))
|
||||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A))
|
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A))
|
||||||
|
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940)
|
||||||
|
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942)
|
||||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030))
|
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030))
|
||||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100))
|
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100))
|
||||||
|
|
||||||
@ -1113,6 +1115,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1)
|
|||||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
||||||
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
|
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1)
|
||||||
|
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940")
|
||||||
|
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
||||||
|
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940
|
||||||
|
endif
|
||||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1)
|
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1)
|
||||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942")
|
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942")
|
||||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
||||||
|
|||||||
@ -198,8 +198,9 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) {
|
|||||||
|
|
||||||
// this is needed for intel to avoid
|
// this is needed for intel to avoid
|
||||||
// error #1011: missing return statement at end of non-void function
|
// error #1011: missing return statement at end of non-void function
|
||||||
#if defined KOKKOS_COMPILER_INTEL || \
|
#if defined KOKKOS_COMPILER_INTEL || \
|
||||||
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
|
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \
|
||||||
|
!defined(KOKKOS_COMPILER_MSVC))
|
||||||
__builtin_unreachable();
|
__builtin_unreachable();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|||||||
@ -139,7 +139,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
|||||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
||||||
numTeams, numCols);
|
numTeams, numCols);
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
|
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
|
||||||
auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
|
auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
|
||||||
auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
|
auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
|
||||||
|
|||||||
@ -191,7 +191,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
|
|||||||
// -----------------------------------------------
|
// -----------------------------------------------
|
||||||
auto returnView_h = create_host_space_copy(returnView);
|
auto returnView_h = create_host_space_copy(returnView);
|
||||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
|
|
||||||
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
|
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
|
||||||
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
|
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
|
||||||
|
|||||||
@ -240,7 +240,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
|
|||||||
"stdDestTrueView", numTeams, numCols);
|
"stdDestTrueView", numTeams, numCols);
|
||||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestFalseView(
|
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestFalseView(
|
||||||
"stdDestFalseView", numTeams, numCols);
|
"stdDestFalseView", numTeams, numCols);
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
|
|
||||||
for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
|
for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
|
||||||
auto myRowSource = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL());
|
auto myRowSource = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL());
|
||||||
|
|||||||
@ -197,7 +197,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
|
|||||||
auto distancesView_h = create_host_space_copy(distancesView);
|
auto distancesView_h = create_host_space_copy(distancesView);
|
||||||
auto dataViewAfterOp_h = create_host_space_copy(dataView);
|
auto dataViewAfterOp_h = create_host_space_copy(dataView);
|
||||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
|
|
||||||
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
|
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
|
||||||
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
|
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
|
||||||
|
|||||||
@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
|||||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
||||||
numTeams, numCols);
|
numTeams, numCols);
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
|
for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
|
||||||
auto rowFrom =
|
auto rowFrom =
|
||||||
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
|
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
|
||||||
|
|||||||
@ -127,7 +127,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
|||||||
// -----------------------------------------------
|
// -----------------------------------------------
|
||||||
// check against std
|
// check against std
|
||||||
// -----------------------------------------------
|
// -----------------------------------------------
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
auto dataViewAfterOp_h = create_host_space_copy(dataView);
|
auto dataViewAfterOp_h = create_host_space_copy(dataView);
|
||||||
auto distancesView_h = create_host_space_copy(distancesView);
|
auto distancesView_h = create_host_space_copy(distancesView);
|
||||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||||
|
|||||||
@ -145,7 +145,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
|||||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
||||||
numTeams, numCols);
|
numTeams, numCols);
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
|
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
|
||||||
auto rowFrom =
|
auto rowFrom =
|
||||||
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
|
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
|
||||||
|
|||||||
@ -103,7 +103,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
|||||||
stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j);
|
stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
GreaterThanValueFunctor predicate(threshold);
|
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||||
for (std::size_t i = 0; i < dataView.extent(0); ++i) {
|
for (std::size_t i = 0; i < dataView.extent(0); ++i) {
|
||||||
auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL());
|
auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL());
|
||||||
std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal);
|
std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal);
|
||||||
|
|||||||
@ -114,6 +114,7 @@
|
|||||||
#cmakedefine KOKKOS_ARCH_AMD_GFX906
|
#cmakedefine KOKKOS_ARCH_AMD_GFX906
|
||||||
#cmakedefine KOKKOS_ARCH_AMD_GFX908
|
#cmakedefine KOKKOS_ARCH_AMD_GFX908
|
||||||
#cmakedefine KOKKOS_ARCH_AMD_GFX90A
|
#cmakedefine KOKKOS_ARCH_AMD_GFX90A
|
||||||
|
#cmakedefine KOKKOS_ARCH_AMD_GFX940
|
||||||
#cmakedefine KOKKOS_ARCH_AMD_GFX942
|
#cmakedefine KOKKOS_ARCH_AMD_GFX942
|
||||||
#cmakedefine KOKKOS_ARCH_AMD_GFX1030
|
#cmakedefine KOKKOS_ARCH_AMD_GFX1030
|
||||||
#cmakedefine KOKKOS_ARCH_AMD_GFX1100
|
#cmakedefine KOKKOS_ARCH_AMD_GFX1100
|
||||||
|
|||||||
@ -94,9 +94,9 @@ IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR K
|
|||||||
ENDIF()
|
ENDIF()
|
||||||
|
|
||||||
# AMD archs ordered in decreasing priority of autodetection
|
# AMD archs ordered in decreasing priority of autodetection
|
||||||
LIST(APPEND SUPPORTED_AMD_GPUS MI300)
|
LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300)
|
||||||
LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942)
|
LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940)
|
||||||
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942)
|
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940)
|
||||||
LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100)
|
LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100)
|
||||||
LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908)
|
LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908)
|
||||||
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908)
|
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908)
|
||||||
|
|||||||
@ -309,6 +309,11 @@ class ParallelReduce<CombinedFunctorReducerType,
|
|||||||
|
|
||||||
if (CudaTraits::WarpSize < word_count.value) {
|
if (CudaTraits::WarpSize < word_count.value) {
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
} else {
|
||||||
|
// In the above call to final(), shared might have been updated by a
|
||||||
|
// single thread within a warp without synchronization. Synchronize
|
||||||
|
// threads within warp to avoid potential race condition.
|
||||||
|
__syncwarp(0xffffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
||||||
|
|||||||
@ -243,6 +243,12 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
|
|||||||
|
|
||||||
if (CudaTraits::WarpSize < word_count.value) {
|
if (CudaTraits::WarpSize < word_count.value) {
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
} else if (word_count.value > 1) {
|
||||||
|
// Inside cuda_single_inter_block_reduce_scan() above, shared[i] below
|
||||||
|
// might have been updated by a single thread within a warp without
|
||||||
|
// synchronization afterwards. Synchronize threads within warp to avoid
|
||||||
|
// potential racecondition.
|
||||||
|
__syncwarp(0xffffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
||||||
|
|||||||
@ -742,6 +742,11 @@ class ParallelReduce<CombinedFunctorReducerType,
|
|||||||
|
|
||||||
if (CudaTraits::WarpSize < word_count.value) {
|
if (CudaTraits::WarpSize < word_count.value) {
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
} else {
|
||||||
|
// In the above call to final(), shared might have been updated by a
|
||||||
|
// single thread within a warp without synchronization. Synchronize
|
||||||
|
// threads within warp to avoid potential race condition.
|
||||||
|
__syncwarp(0xffffffff);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
||||||
|
|||||||
@ -83,7 +83,7 @@ class GraphImpl<Kokkos::HIP> {
|
|||||||
hipGraphExec_t m_graph_exec = nullptr;
|
hipGraphExec_t m_graph_exec = nullptr;
|
||||||
};
|
};
|
||||||
|
|
||||||
GraphImpl<Kokkos::HIP>::~GraphImpl() {
|
inline GraphImpl<Kokkos::HIP>::~GraphImpl() {
|
||||||
m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
|
m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
|
||||||
KOKKOS_EXPECTS(m_graph);
|
KOKKOS_EXPECTS(m_graph);
|
||||||
if (m_graph_exec) {
|
if (m_graph_exec) {
|
||||||
@ -92,12 +92,12 @@ GraphImpl<Kokkos::HIP>::~GraphImpl() {
|
|||||||
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph));
|
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph));
|
||||||
}
|
}
|
||||||
|
|
||||||
GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance)
|
inline GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance)
|
||||||
: m_execution_space(std::move(instance)) {
|
: m_execution_space(std::move(instance)) {
|
||||||
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0));
|
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
void GraphImpl<Kokkos::HIP>::add_node(
|
inline void GraphImpl<Kokkos::HIP>::add_node(
|
||||||
std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
|
std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
|
||||||
// All of the predecessors are just added as normal, so all we need to
|
// All of the predecessors are just added as normal, so all we need to
|
||||||
// do here is add an empty node
|
// do here is add an empty node
|
||||||
@ -110,7 +110,7 @@ void GraphImpl<Kokkos::HIP>::add_node(
|
|||||||
// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl
|
// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl
|
||||||
// Also requires that the kernel has the graph node tag in it's policy
|
// Also requires that the kernel has the graph node tag in it's policy
|
||||||
template <class NodeImpl>
|
template <class NodeImpl>
|
||||||
void GraphImpl<Kokkos::HIP>::add_node(
|
inline void GraphImpl<Kokkos::HIP>::add_node(
|
||||||
std::shared_ptr<NodeImpl> const& arg_node_ptr) {
|
std::shared_ptr<NodeImpl> const& arg_node_ptr) {
|
||||||
static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value);
|
static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value);
|
||||||
KOKKOS_EXPECTS(arg_node_ptr);
|
KOKKOS_EXPECTS(arg_node_ptr);
|
||||||
@ -129,8 +129,8 @@ void GraphImpl<Kokkos::HIP>::add_node(
|
|||||||
// already been added to this graph and NodeImpl is a specialization of
|
// already been added to this graph and NodeImpl is a specialization of
|
||||||
// GraphNodeImpl that has already been added to this graph.
|
// GraphNodeImpl that has already been added to this graph.
|
||||||
template <class NodeImplPtr, class PredecessorRef>
|
template <class NodeImplPtr, class PredecessorRef>
|
||||||
void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr,
|
inline void GraphImpl<Kokkos::HIP>::add_predecessor(
|
||||||
PredecessorRef arg_pred_ref) {
|
NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) {
|
||||||
KOKKOS_EXPECTS(arg_node_ptr);
|
KOKKOS_EXPECTS(arg_node_ptr);
|
||||||
auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref);
|
auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref);
|
||||||
KOKKOS_EXPECTS(pred_ptr);
|
KOKKOS_EXPECTS(pred_ptr);
|
||||||
@ -145,7 +145,7 @@ void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr,
|
|||||||
hipGraphAddDependencies(m_graph, &pred_node, &node, 1));
|
hipGraphAddDependencies(m_graph, &pred_node, &node, 1));
|
||||||
}
|
}
|
||||||
|
|
||||||
void GraphImpl<Kokkos::HIP>::submit() {
|
inline void GraphImpl<Kokkos::HIP>::submit() {
|
||||||
if (!m_graph_exec) {
|
if (!m_graph_exec) {
|
||||||
instantiate_graph();
|
instantiate_graph();
|
||||||
}
|
}
|
||||||
@ -153,12 +153,12 @@ void GraphImpl<Kokkos::HIP>::submit() {
|
|||||||
hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream()));
|
hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream()));
|
||||||
}
|
}
|
||||||
|
|
||||||
Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const
|
inline Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const
|
||||||
noexcept {
|
noexcept {
|
||||||
return m_execution_space;
|
return m_execution_space;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
|
inline auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
|
||||||
KOKKOS_EXPECTS(m_graph);
|
KOKKOS_EXPECTS(m_graph);
|
||||||
KOKKOS_EXPECTS(!m_graph_exec);
|
KOKKOS_EXPECTS(!m_graph_exec);
|
||||||
auto rv = std::make_shared<root_node_impl_t>(get_execution_space(),
|
auto rv = std::make_shared<root_node_impl_t>(get_execution_space(),
|
||||||
@ -172,7 +172,7 @@ auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
template <class... PredecessorRefs>
|
template <class... PredecessorRefs>
|
||||||
auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) {
|
inline auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) {
|
||||||
// The attachment to predecessors, which is all we really need, happens
|
// The attachment to predecessors, which is all we really need, happens
|
||||||
// in the generic layer, which calls through to add_predecessor for
|
// in the generic layer, which calls through to add_predecessor for
|
||||||
// each predecessor ref, so all we need to do here is create the (trivial)
|
// each predecessor ref, so all we need to do here is create the (trivial)
|
||||||
|
|||||||
@ -30,7 +30,8 @@ namespace Impl {
|
|||||||
|
|
||||||
struct HIPTraits {
|
struct HIPTraits {
|
||||||
#if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
|
#if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
|
||||||
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942)
|
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \
|
||||||
|
defined(KOKKOS_ARCH_AMD_GFX942)
|
||||||
static constexpr int WarpSize = 64;
|
static constexpr int WarpSize = 64;
|
||||||
static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */
|
static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */
|
||||||
static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/
|
static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/
|
||||||
|
|||||||
@ -75,12 +75,19 @@ class HostSpace {
|
|||||||
/**\brief Non-default memory space instance to choose allocation mechansim,
|
/**\brief Non-default memory space instance to choose allocation mechansim,
|
||||||
* if available */
|
* if available */
|
||||||
|
|
||||||
enum KOKKOS_DEPRECATED AllocationMechanism {
|
#if defined(KOKKOS_COMPILER_GNU) && KOKKOS_COMPILER_GNU < 1100
|
||||||
STD_MALLOC,
|
// We see deprecation warnings even when not using the deprecated
|
||||||
POSIX_MEMALIGN,
|
// HostSpace constructor below when using gcc before release 11.
|
||||||
POSIX_MMAP,
|
enum
|
||||||
INTEL_MM_ALLOC
|
#else
|
||||||
};
|
enum KOKKOS_DEPRECATED
|
||||||
|
#endif
|
||||||
|
AllocationMechanism {
|
||||||
|
STD_MALLOC,
|
||||||
|
POSIX_MEMALIGN,
|
||||||
|
POSIX_MMAP,
|
||||||
|
INTEL_MM_ALLOC
|
||||||
|
};
|
||||||
|
|
||||||
KOKKOS_DEPRECATED
|
KOKKOS_DEPRECATED
|
||||||
explicit HostSpace(const AllocationMechanism&);
|
explicit HostSpace(const AllocationMechanism&);
|
||||||
|
|||||||
@ -31,7 +31,7 @@ namespace Kokkos {
|
|||||||
// backends. The GPU backends always return 1 and NVHPC only compiles if we
|
// backends. The GPU backends always return 1 and NVHPC only compiles if we
|
||||||
// don't ask for the return value.
|
// don't ask for the return value.
|
||||||
template <typename... Args>
|
template <typename... Args>
|
||||||
KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) {
|
KOKKOS_FUNCTION void printf(const char* format, Args... args) {
|
||||||
#ifdef KOKKOS_ENABLE_SYCL
|
#ifdef KOKKOS_ENABLE_SYCL
|
||||||
// Some compilers warn if "args" is empty and format is not a string literal
|
// Some compilers warn if "args" is empty and format is not a string literal
|
||||||
if constexpr (sizeof...(Args) == 0)
|
if constexpr (sizeof...(Args) == 0)
|
||||||
|
|||||||
@ -359,8 +359,6 @@ void OpenMPInternal::finalize() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
m_initialized = false;
|
m_initialized = false;
|
||||||
|
|
||||||
Kokkos::Profiling::finalize();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void OpenMPInternal::print_configuration(std::ostream &s) const {
|
void OpenMPInternal::print_configuration(std::ostream &s) const {
|
||||||
|
|||||||
@ -219,6 +219,8 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
|
|||||||
Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
|
Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
|
||||||
partition_size);
|
partition_size);
|
||||||
|
|
||||||
|
OpenMP::memory_space space;
|
||||||
|
|
||||||
#pragma omp parallel num_threads(num_partitions)
|
#pragma omp parallel num_threads(num_partitions)
|
||||||
{
|
{
|
||||||
Exec thread_local_instance(partition_size);
|
Exec thread_local_instance(partition_size);
|
||||||
|
|||||||
@ -58,8 +58,6 @@ void SerialInternal::finalize() {
|
|||||||
m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
|
m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
Kokkos::Profiling::finalize();
|
|
||||||
|
|
||||||
m_is_initialized = false;
|
m_is_initialized = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -30,6 +30,7 @@ static_assert(false,
|
|||||||
|
|
||||||
#include <cstddef>
|
#include <cstddef>
|
||||||
#include <iosfwd>
|
#include <iosfwd>
|
||||||
|
#include <iterator>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <Kokkos_Core_fwd.hpp>
|
#include <Kokkos_Core_fwd.hpp>
|
||||||
|
|||||||
@ -815,8 +815,6 @@ void ThreadsExec::finalize() {
|
|||||||
s_threads_process.m_pool_size = 1;
|
s_threads_process.m_pool_size = 1;
|
||||||
s_threads_process.m_pool_fan_size = 0;
|
s_threads_process.m_pool_fan_size = 0;
|
||||||
s_threads_process.m_pool_state = ThreadsExec::Inactive;
|
s_threads_process.m_pool_state = ThreadsExec::Inactive;
|
||||||
|
|
||||||
Kokkos::Profiling::finalize();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//----------------------------------------------------------------------------
|
//----------------------------------------------------------------------------
|
||||||
|
|||||||
@ -30,8 +30,9 @@
|
|||||||
#define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
|
#define MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined KOKKOS_COMPILER_INTEL || \
|
#if defined KOKKOS_COMPILER_INTEL || \
|
||||||
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
|
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \
|
||||||
|
!defined(KOKKOS_COMPILER_MSVC))
|
||||||
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable();
|
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable();
|
||||||
#else
|
#else
|
||||||
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE
|
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE
|
||||||
@ -394,10 +395,12 @@ DEFINE_UNARY_FUNCTION_EVAL(log2, 2);
|
|||||||
DEFINE_UNARY_FUNCTION_EVAL(log1p, 2);
|
DEFINE_UNARY_FUNCTION_EVAL(log1p, 2);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||||
DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2);
|
DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2);
|
||||||
DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2);
|
DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
||||||
DEFINE_UNARY_FUNCTION_EVAL(sin, 2);
|
DEFINE_UNARY_FUNCTION_EVAL(sin, 2);
|
||||||
DEFINE_UNARY_FUNCTION_EVAL(cos, 2);
|
DEFINE_UNARY_FUNCTION_EVAL(cos, 2);
|
||||||
DEFINE_UNARY_FUNCTION_EVAL(tan, 2);
|
DEFINE_UNARY_FUNCTION_EVAL(tan, 2);
|
||||||
@ -483,11 +486,9 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2);
|
|||||||
}; \
|
}; \
|
||||||
constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]
|
constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]
|
||||||
|
|
||||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||||
DEFINE_BINARY_FUNCTION_EVAL(pow, 2);
|
DEFINE_BINARY_FUNCTION_EVAL(pow, 2);
|
||||||
DEFINE_BINARY_FUNCTION_EVAL(hypot, 2);
|
DEFINE_BINARY_FUNCTION_EVAL(hypot, 2);
|
||||||
#endif
|
|
||||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
|
||||||
DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1);
|
DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1);
|
||||||
DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
|
DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
|
||||||
#endif
|
#endif
|
||||||
@ -519,7 +520,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
|
|||||||
}; \
|
}; \
|
||||||
constexpr char math_function_name<MathTernaryFunction_##FUNC>::name[]
|
constexpr char math_function_name<MathTernaryFunction_##FUNC>::name[]
|
||||||
|
|
||||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||||
DEFINE_TERNARY_FUNCTION_EVAL(hypot, 2);
|
DEFINE_TERNARY_FUNCTION_EVAL(hypot, 2);
|
||||||
DEFINE_TERNARY_FUNCTION_EVAL(fma, 2);
|
DEFINE_TERNARY_FUNCTION_EVAL(fma, 2);
|
||||||
#endif
|
#endif
|
||||||
@ -787,7 +788,9 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
|
|||||||
|
|
||||||
// TODO atan2
|
// TODO atan2
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||||
TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
|
TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
|
||||||
TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11});
|
TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11});
|
||||||
TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l});
|
TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l});
|
||||||
@ -1568,6 +1571,7 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) {
|
|||||||
|
|
||||||
// TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279
|
// TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279
|
||||||
|
|
||||||
|
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||||
template <class Space>
|
template <class Space>
|
||||||
struct TestIsFinite {
|
struct TestIsFinite {
|
||||||
TestIsFinite() { run(); }
|
TestIsFinite() { run(); }
|
||||||
@ -1591,6 +1595,7 @@ struct TestIsFinite {
|
|||||||
++e;
|
++e;
|
||||||
Kokkos::printf("failed isfinite(float)\n");
|
Kokkos::printf("failed isfinite(float)\n");
|
||||||
}
|
}
|
||||||
|
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||||
if (!isfinite(static_cast<KE::half_t>(2.f))
|
if (!isfinite(static_cast<KE::half_t>(2.f))
|
||||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||||
|| isfinite(quiet_NaN<KE::half_t>::value) ||
|
|| isfinite(quiet_NaN<KE::half_t>::value) ||
|
||||||
@ -1611,6 +1616,7 @@ struct TestIsFinite {
|
|||||||
++e;
|
++e;
|
||||||
Kokkos::printf("failed isfinite(KE::bhalf_t)\n");
|
Kokkos::printf("failed isfinite(KE::bhalf_t)\n");
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
if (!isfinite(3.)
|
if (!isfinite(3.)
|
||||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||||
|| isfinite(quiet_NaN<double>::value) ||
|
|| isfinite(quiet_NaN<double>::value) ||
|
||||||
@ -1670,6 +1676,7 @@ struct TestIsInf {
|
|||||||
++e;
|
++e;
|
||||||
Kokkos::printf("failed isinf(float)\n");
|
Kokkos::printf("failed isinf(float)\n");
|
||||||
}
|
}
|
||||||
|
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||||
if (isinf(static_cast<KE::half_t>(2.f))
|
if (isinf(static_cast<KE::half_t>(2.f))
|
||||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||||
|| isinf(quiet_NaN<KE::half_t>::value) ||
|
|| isinf(quiet_NaN<KE::half_t>::value) ||
|
||||||
@ -1690,6 +1697,7 @@ struct TestIsInf {
|
|||||||
++e;
|
++e;
|
||||||
Kokkos::printf("failed isinf(KE::bhalf_t)\n");
|
Kokkos::printf("failed isinf(KE::bhalf_t)\n");
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
if (isinf(3.)
|
if (isinf(3.)
|
||||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||||
|| isinf(quiet_NaN<double>::value) ||
|
|| isinf(quiet_NaN<double>::value) ||
|
||||||
@ -1748,6 +1756,7 @@ struct TestIsNaN {
|
|||||||
++e;
|
++e;
|
||||||
Kokkos::printf("failed isnan(float)\n");
|
Kokkos::printf("failed isnan(float)\n");
|
||||||
}
|
}
|
||||||
|
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||||
if (isnan(static_cast<KE::half_t>(2.f))
|
if (isnan(static_cast<KE::half_t>(2.f))
|
||||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||||
|| !isnan(quiet_NaN<KE::half_t>::value) ||
|
|| !isnan(quiet_NaN<KE::half_t>::value) ||
|
||||||
@ -1777,6 +1786,7 @@ struct TestIsNaN {
|
|||||||
++e;
|
++e;
|
||||||
Kokkos::printf("failed isnan(double)\n");
|
Kokkos::printf("failed isnan(double)\n");
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
|
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
|
||||||
if (isnan(4.l) || !isnan(quiet_NaN<long double>::value) ||
|
if (isnan(4.l) || !isnan(quiet_NaN<long double>::value) ||
|
||||||
!isnan(signaling_NaN<long double>::value) ||
|
!isnan(signaling_NaN<long double>::value) ||
|
||||||
@ -1803,6 +1813,7 @@ struct TestIsNaN {
|
|||||||
TEST(TEST_CATEGORY, mathematical_functions_isnan) {
|
TEST(TEST_CATEGORY, mathematical_functions_isnan) {
|
||||||
TestIsNaN<TEST_EXECSPACE>();
|
TestIsNaN<TEST_EXECSPACE>();
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
// TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279
|
// TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -110,8 +110,8 @@ struct TestNumericTraits {
|
|||||||
|
|
||||||
KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const {
|
KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const {
|
||||||
using Kokkos::Experimental::epsilon;
|
using Kokkos::Experimental::epsilon;
|
||||||
auto const eps = epsilon<T>::value;
|
T const eps = epsilon<T>::value;
|
||||||
auto const one = T(1);
|
T const one = 1;
|
||||||
// Avoid higher precision intermediate representation
|
// Avoid higher precision intermediate representation
|
||||||
compare() = one + eps;
|
compare() = one + eps;
|
||||||
e += (int)!(compare() != one);
|
e += (int)!(compare() != one);
|
||||||
|
|||||||
@ -160,6 +160,7 @@ display_help_text() {
|
|||||||
echo " AMD_GFX906 = AMD GPU MI50/MI60 GFX906"
|
echo " AMD_GFX906 = AMD GPU MI50/MI60 GFX906"
|
||||||
echo " AMD_GFX908 = AMD GPU MI100 GFX908"
|
echo " AMD_GFX908 = AMD GPU MI100 GFX908"
|
||||||
echo " AMD_GFX90A = AMD GPU MI200 GFX90A"
|
echo " AMD_GFX90A = AMD GPU MI200 GFX90A"
|
||||||
|
echo " AMD_GFX940 = AMD GPU MI300 GFX940"
|
||||||
echo " AMD_GFX942 = AMD GPU MI300 GFX942"
|
echo " AMD_GFX942 = AMD GPU MI300 GFX942"
|
||||||
echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030"
|
echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030"
|
||||||
echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100"
|
echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100"
|
||||||
|
|||||||
@ -34,3 +34,4 @@ tag: 4.0.00 date: 02:23:2023 master: 5ad60966 release: 52ea2953
|
|||||||
tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f
|
tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f
|
||||||
tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a
|
tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a
|
||||||
tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88
|
tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88
|
||||||
|
tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a
|
||||||
|
|||||||
@ -224,7 +224,7 @@ template <typename T>
|
|||||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||||
Kokkos::floor(static_cast<data_type>(a[0])));
|
Kokkos::floor(static_cast<data_type>(a[0])));
|
||||||
};
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil(
|
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil(
|
||||||
@ -232,7 +232,7 @@ template <typename T>
|
|||||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||||
Kokkos::ceil(static_cast<data_type>(a[0])));
|
Kokkos::ceil(static_cast<data_type>(a[0])));
|
||||||
};
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round(
|
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round(
|
||||||
@ -240,7 +240,7 @@ template <typename T>
|
|||||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||||
Experimental::round_half_to_nearest_even(static_cast<data_type>(a[0])));
|
Experimental::round_half_to_nearest_even(static_cast<data_type>(a[0])));
|
||||||
};
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc(
|
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc(
|
||||||
@ -248,7 +248,7 @@ template <typename T>
|
|||||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||||
Kokkos::trunc(static_cast<data_type>(a[0])));
|
Kokkos::trunc(static_cast<data_type>(a[0])));
|
||||||
};
|
}
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
|
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
|
||||||
|
|||||||
@ -42,6 +42,7 @@ inline void host_check_gen_ctor() {
|
|||||||
simd_type blend;
|
simd_type blend;
|
||||||
blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag());
|
blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag());
|
||||||
|
|
||||||
|
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||||
if constexpr (std::is_same_v<Abi, Kokkos::Experimental::simd_abi::scalar>) {
|
if constexpr (std::is_same_v<Abi, Kokkos::Experimental::simd_abi::scalar>) {
|
||||||
simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; });
|
simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; });
|
||||||
host_check_equality(basic, rhs, lanes);
|
host_check_equality(basic, rhs, lanes);
|
||||||
@ -63,6 +64,7 @@ inline void host_check_gen_ctor() {
|
|||||||
|
|
||||||
host_check_equality(blend, result, lanes);
|
host_check_equality(blend, result, lanes);
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename Abi, typename... DataTypes>
|
template <typename Abi, typename... DataTypes>
|
||||||
|
|||||||
Reference in New Issue
Block a user