Update Kokkos library in LAMMPS to v4.2.1

This commit is contained in:
Stan Gerald Moore
2024-03-06 14:58:06 -07:00
parent 9d9dbc1fa8
commit 6cc9a4c7b7
32 changed files with 121 additions and 64 deletions

View File

@ -1,5 +1,26 @@
# CHANGELOG
## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01)
### Backend and Architecture Enhancements:
#### CUDA:
- Add warp sync for `parallel_reduce` to avoid race condition [\#6630](https://github.com/kokkos/kokkos/pull/6630), [\#6746](https://github.com/kokkos/kokkos/pull/6746)
#### HIP:
- Fix Graph "multiple definition of" linking error (missing `inline` specifier) [\#6624](https://github.com/kokkos/kokkos/pull/6624)
- Add support for gfx940 (AMD Instinct MI300 GPU) [\#6671](https://github.com/kokkos/kokkos/pull/6671)
### Build System
- CMake: Don't let Kokkos set `CMAKE_CXX_FLAGS` for Trilinos builds [\#6742](https://github.com/kokkos/kokkos/pull/6742)
### Bug Fixes
- Remove deprecation warning for `AllocationMechanism` for GCC <11.0 [\#6653](https://github.com/kokkos/kokkos/pull/6653)
- Fix bug early tools finalize with non-default host execution instances [\#6635](https://github.com/kokkos/kokkos/pull/6635)
- Fix various issues for MSVC CUDA builds [\#6659](https://github.com/kokkos/kokkos/pull/6659)
- Fix "extra `;`" warning with `-pedantic` flag in `<Kokkos_SIMD_Scalar.hpp>` [\#6510](https://github.com/kokkos/kokkos/pull/6510)
## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00)

View File

@ -151,7 +151,7 @@ ENDIF()
set(Kokkos_VERSION_MAJOR 4)
set(Kokkos_VERSION_MINOR 2)
set(Kokkos_VERSION_PATCH 0)
set(Kokkos_VERSION_PATCH 1)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
@ -252,7 +252,6 @@ ENDIF()
# subpackages
## This restores the old behavior of ProjectCompilerPostConfig.cmake
# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos
# We must do this before KOKKOS_PACKAGE_DECL
IF (KOKKOS_HAS_TRILINOS)
# Overwrite the old flags at the top-level
@ -280,21 +279,13 @@ IF (KOKKOS_HAS_TRILINOS)
SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
ENDFOREACH()
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
IF (KOKKOS_ENABLE_CUDA)
STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}")
FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}")
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG})
ENDFOREACH()
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}")
ENDIF()
# Both parent scope and this package
# In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in
# TRILINOS_TOPLEVEL_CXX_FLAGS
SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE)
SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}")
#CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here
#These flags get set up in KOKKOS_PACKAGE_DECL, which means they
#must be configured before KOKKOS_PACKAGE_DECL
SET(KOKKOS_ALL_COMPILE_OPTIONS

View File

@ -12,7 +12,7 @@ endif
KOKKOS_VERSION_MAJOR = 4
KOKKOS_VERSION_MINOR = 2
KOKKOS_VERSION_PATCH = 0
KOKKOS_VERSION_PATCH = 1
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
@ -23,7 +23,7 @@ KOKKOS_DEVICES ?= "OpenMP"
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100
# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
KOKKOS_ARCH ?= ""
@ -416,6 +416,8 @@ endif
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906))
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908))
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A))
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940)
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942)
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030))
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100))
@ -1113,6 +1115,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1)
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942")
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")

View File

@ -199,7 +199,8 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) {
// this is needed for intel to avoid
// error #1011: missing return statement at end of non-void function
#if defined KOKKOS_COMPILER_INTEL || \
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \
!defined(KOKKOS_COMPILER_MSVC))
__builtin_unreachable();
#endif
}

View File

@ -139,7 +139,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
numTeams, numCols);
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());

View File

@ -191,7 +191,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
// -----------------------------------------------
auto returnView_h = create_host_space_copy(returnView);
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());

View File

@ -240,7 +240,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
"stdDestTrueView", numTeams, numCols);
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestFalseView(
"stdDestFalseView", numTeams, numCols);
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
auto myRowSource = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL());

View File

@ -197,7 +197,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
auto distancesView_h = create_host_space_copy(distancesView);
auto dataViewAfterOp_h = create_host_space_copy(dataView);
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());

View File

@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
numTeams, numCols);
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
auto rowFrom =
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());

View File

@ -127,7 +127,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
// -----------------------------------------------
// check against std
// -----------------------------------------------
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
auto dataViewAfterOp_h = create_host_space_copy(dataView);
auto distancesView_h = create_host_space_copy(distancesView);
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);

View File

@ -145,7 +145,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
numTeams, numCols);
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
auto rowFrom =
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());

View File

@ -103,7 +103,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j);
}
}
GreaterThanValueFunctor predicate(threshold);
GreaterThanValueFunctor<ValueType> predicate(threshold);
for (std::size_t i = 0; i < dataView.extent(0); ++i) {
auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL());
std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal);

View File

@ -114,6 +114,7 @@
#cmakedefine KOKKOS_ARCH_AMD_GFX906
#cmakedefine KOKKOS_ARCH_AMD_GFX908
#cmakedefine KOKKOS_ARCH_AMD_GFX90A
#cmakedefine KOKKOS_ARCH_AMD_GFX940
#cmakedefine KOKKOS_ARCH_AMD_GFX942
#cmakedefine KOKKOS_ARCH_AMD_GFX1030
#cmakedefine KOKKOS_ARCH_AMD_GFX1100

View File

@ -94,9 +94,9 @@ IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR K
ENDIF()
# AMD archs ordered in decreasing priority of autodetection
LIST(APPEND SUPPORTED_AMD_GPUS MI300)
LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942)
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942)
LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300)
LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940)
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940)
LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100)
LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908)
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908)

View File

@ -309,6 +309,11 @@ class ParallelReduce<CombinedFunctorReducerType,
if (CudaTraits::WarpSize < word_count.value) {
__syncthreads();
} else {
// In the above call to final(), shared might have been updated by a
// single thread within a warp without synchronization. Synchronize
// threads within warp to avoid potential race condition.
__syncwarp(0xffffffff);
}
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {

View File

@ -243,6 +243,12 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
if (CudaTraits::WarpSize < word_count.value) {
__syncthreads();
} else if (word_count.value > 1) {
// Inside cuda_single_inter_block_reduce_scan() above, shared[i] below
// might have been updated by a single thread within a warp without
// synchronization afterwards. Synchronize threads within warp to avoid
// potential racecondition.
__syncwarp(0xffffffff);
}
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {

View File

@ -742,6 +742,11 @@ class ParallelReduce<CombinedFunctorReducerType,
if (CudaTraits::WarpSize < word_count.value) {
__syncthreads();
} else {
// In the above call to final(), shared might have been updated by a
// single thread within a warp without synchronization. Synchronize
// threads within warp to avoid potential race condition.
__syncwarp(0xffffffff);
}
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {

View File

@ -83,7 +83,7 @@ class GraphImpl<Kokkos::HIP> {
hipGraphExec_t m_graph_exec = nullptr;
};
GraphImpl<Kokkos::HIP>::~GraphImpl() {
inline GraphImpl<Kokkos::HIP>::~GraphImpl() {
m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
KOKKOS_EXPECTS(m_graph);
if (m_graph_exec) {
@ -92,12 +92,12 @@ GraphImpl<Kokkos::HIP>::~GraphImpl() {
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph));
}
GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance)
inline GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance)
: m_execution_space(std::move(instance)) {
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0));
}
void GraphImpl<Kokkos::HIP>::add_node(
inline void GraphImpl<Kokkos::HIP>::add_node(
std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
// All of the predecessors are just added as normal, so all we need to
// do here is add an empty node
@ -110,7 +110,7 @@ void GraphImpl<Kokkos::HIP>::add_node(
// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl
// Also requires that the kernel has the graph node tag in it's policy
template <class NodeImpl>
void GraphImpl<Kokkos::HIP>::add_node(
inline void GraphImpl<Kokkos::HIP>::add_node(
std::shared_ptr<NodeImpl> const& arg_node_ptr) {
static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value);
KOKKOS_EXPECTS(arg_node_ptr);
@ -129,8 +129,8 @@ void GraphImpl<Kokkos::HIP>::add_node(
// already been added to this graph and NodeImpl is a specialization of
// GraphNodeImpl that has already been added to this graph.
template <class NodeImplPtr, class PredecessorRef>
void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr,
PredecessorRef arg_pred_ref) {
inline void GraphImpl<Kokkos::HIP>::add_predecessor(
NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) {
KOKKOS_EXPECTS(arg_node_ptr);
auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref);
KOKKOS_EXPECTS(pred_ptr);
@ -145,7 +145,7 @@ void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr,
hipGraphAddDependencies(m_graph, &pred_node, &node, 1));
}
void GraphImpl<Kokkos::HIP>::submit() {
inline void GraphImpl<Kokkos::HIP>::submit() {
if (!m_graph_exec) {
instantiate_graph();
}
@ -153,12 +153,12 @@ void GraphImpl<Kokkos::HIP>::submit() {
hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream()));
}
Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const
inline Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const
noexcept {
return m_execution_space;
}
auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
inline auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
KOKKOS_EXPECTS(m_graph);
KOKKOS_EXPECTS(!m_graph_exec);
auto rv = std::make_shared<root_node_impl_t>(get_execution_space(),
@ -172,7 +172,7 @@ auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
}
template <class... PredecessorRefs>
auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) {
inline auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) {
// The attachment to predecessors, which is all we really need, happens
// in the generic layer, which calls through to add_predecessor for
// each predecessor ref, so all we need to do here is create the (trivial)

View File

@ -30,7 +30,8 @@ namespace Impl {
struct HIPTraits {
#if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942)
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \
defined(KOKKOS_ARCH_AMD_GFX942)
static constexpr int WarpSize = 64;
static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */
static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/

View File

@ -75,7 +75,14 @@ class HostSpace {
/**\brief Non-default memory space instance to choose allocation mechansim,
* if available */
enum KOKKOS_DEPRECATED AllocationMechanism {
#if defined(KOKKOS_COMPILER_GNU) && KOKKOS_COMPILER_GNU < 1100
// We see deprecation warnings even when not using the deprecated
// HostSpace constructor below when using gcc before release 11.
enum
#else
enum KOKKOS_DEPRECATED
#endif
AllocationMechanism {
STD_MALLOC,
POSIX_MEMALIGN,
POSIX_MMAP,

View File

@ -31,7 +31,7 @@ namespace Kokkos {
// backends. The GPU backends always return 1 and NVHPC only compiles if we
// don't ask for the return value.
template <typename... Args>
KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) {
KOKKOS_FUNCTION void printf(const char* format, Args... args) {
#ifdef KOKKOS_ENABLE_SYCL
// Some compilers warn if "args" is empty and format is not a string literal
if constexpr (sizeof...(Args) == 0)

View File

@ -359,8 +359,6 @@ void OpenMPInternal::finalize() {
}
m_initialized = false;
Kokkos::Profiling::finalize();
}
void OpenMPInternal::print_configuration(std::ostream &s) const {

View File

@ -219,6 +219,8 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
partition_size);
OpenMP::memory_space space;
#pragma omp parallel num_threads(num_partitions)
{
Exec thread_local_instance(partition_size);

View File

@ -58,8 +58,6 @@ void SerialInternal::finalize() {
m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
}
Kokkos::Profiling::finalize();
m_is_initialized = false;
}

View File

@ -30,6 +30,7 @@ static_assert(false,
#include <cstddef>
#include <iosfwd>
#include <iterator>
#include <mutex>
#include <thread>
#include <Kokkos_Core_fwd.hpp>

View File

@ -815,8 +815,6 @@ void ThreadsExec::finalize() {
s_threads_process.m_pool_size = 1;
s_threads_process.m_pool_fan_size = 0;
s_threads_process.m_pool_state = ThreadsExec::Inactive;
Kokkos::Profiling::finalize();
}
//----------------------------------------------------------------------------

View File

@ -31,7 +31,8 @@
#endif
#if defined KOKKOS_COMPILER_INTEL || \
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \
!defined(KOKKOS_COMPILER_MSVC))
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable();
#else
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE
@ -394,10 +395,12 @@ DEFINE_UNARY_FUNCTION_EVAL(log2, 2);
DEFINE_UNARY_FUNCTION_EVAL(log1p, 2);
#endif
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2);
DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2);
#endif
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
DEFINE_UNARY_FUNCTION_EVAL(sin, 2);
DEFINE_UNARY_FUNCTION_EVAL(cos, 2);
DEFINE_UNARY_FUNCTION_EVAL(tan, 2);
@ -483,11 +486,9 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2);
}; \
constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
DEFINE_BINARY_FUNCTION_EVAL(pow, 2);
DEFINE_BINARY_FUNCTION_EVAL(hypot, 2);
#endif
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1);
DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
#endif
@ -519,7 +520,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
}; \
constexpr char math_function_name<MathTernaryFunction_##FUNC>::name[]
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
DEFINE_TERNARY_FUNCTION_EVAL(hypot, 2);
DEFINE_TERNARY_FUNCTION_EVAL(fma, 2);
#endif
@ -787,7 +788,9 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
// TODO atan2
}
#endif
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11});
TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l});
@ -1568,6 +1571,7 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) {
// TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
template <class Space>
struct TestIsFinite {
TestIsFinite() { run(); }
@ -1591,6 +1595,7 @@ struct TestIsFinite {
++e;
Kokkos::printf("failed isfinite(float)\n");
}
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
if (!isfinite(static_cast<KE::half_t>(2.f))
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|| isfinite(quiet_NaN<KE::half_t>::value) ||
@ -1611,6 +1616,7 @@ struct TestIsFinite {
++e;
Kokkos::printf("failed isfinite(KE::bhalf_t)\n");
}
#endif
if (!isfinite(3.)
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|| isfinite(quiet_NaN<double>::value) ||
@ -1670,6 +1676,7 @@ struct TestIsInf {
++e;
Kokkos::printf("failed isinf(float)\n");
}
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
if (isinf(static_cast<KE::half_t>(2.f))
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|| isinf(quiet_NaN<KE::half_t>::value) ||
@ -1690,6 +1697,7 @@ struct TestIsInf {
++e;
Kokkos::printf("failed isinf(KE::bhalf_t)\n");
}
#endif
if (isinf(3.)
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|| isinf(quiet_NaN<double>::value) ||
@ -1748,6 +1756,7 @@ struct TestIsNaN {
++e;
Kokkos::printf("failed isnan(float)\n");
}
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
if (isnan(static_cast<KE::half_t>(2.f))
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|| !isnan(quiet_NaN<KE::half_t>::value) ||
@ -1777,6 +1786,7 @@ struct TestIsNaN {
++e;
Kokkos::printf("failed isnan(double)\n");
}
#endif
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
if (isnan(4.l) || !isnan(quiet_NaN<long double>::value) ||
!isnan(signaling_NaN<long double>::value) ||
@ -1803,6 +1813,7 @@ struct TestIsNaN {
TEST(TEST_CATEGORY, mathematical_functions_isnan) {
TestIsNaN<TEST_EXECSPACE>();
}
#endif
// TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279
#endif

View File

@ -110,8 +110,8 @@ struct TestNumericTraits {
KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const {
using Kokkos::Experimental::epsilon;
auto const eps = epsilon<T>::value;
auto const one = T(1);
T const eps = epsilon<T>::value;
T const one = 1;
// Avoid higher precision intermediate representation
compare() = one + eps;
e += (int)!(compare() != one);

View File

@ -160,6 +160,7 @@ display_help_text() {
echo " AMD_GFX906 = AMD GPU MI50/MI60 GFX906"
echo " AMD_GFX908 = AMD GPU MI100 GFX908"
echo " AMD_GFX90A = AMD GPU MI200 GFX90A"
echo " AMD_GFX940 = AMD GPU MI300 GFX940"
echo " AMD_GFX942 = AMD GPU MI300 GFX942"
echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030"
echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100"

View File

@ -34,3 +34,4 @@ tag: 4.0.00 date: 02:23:2023 master: 5ad60966 release: 52ea2953
tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f
tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a
tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88
tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a

View File

@ -224,7 +224,7 @@ template <typename T>
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
Kokkos::floor(static_cast<data_type>(a[0])));
};
}
template <typename T>
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil(
@ -232,7 +232,7 @@ template <typename T>
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
Kokkos::ceil(static_cast<data_type>(a[0])));
};
}
template <typename T>
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round(
@ -240,7 +240,7 @@ template <typename T>
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
Experimental::round_half_to_nearest_even(static_cast<data_type>(a[0])));
};
}
template <typename T>
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc(
@ -248,7 +248,7 @@ template <typename T>
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
Kokkos::trunc(static_cast<data_type>(a[0])));
};
}
template <class T>
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION

View File

@ -42,6 +42,7 @@ inline void host_check_gen_ctor() {
simd_type blend;
blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag());
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
if constexpr (std::is_same_v<Abi, Kokkos::Experimental::simd_abi::scalar>) {
simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; });
host_check_equality(basic, rhs, lanes);
@ -63,6 +64,7 @@ inline void host_check_gen_ctor() {
host_check_equality(blend, result, lanes);
}
#endif
}
template <typename Abi, typename... DataTypes>