Update Kokkos library in LAMMPS to v4.2.1
This commit is contained in:
@ -1,5 +1,26 @@
|
||||
# CHANGELOG
|
||||
|
||||
## [4.2.01](https://github.com/kokkos/kokkos/tree/4.2.01) (2023-12-07)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.2.00...4.2.01)
|
||||
|
||||
### Backend and Architecture Enhancements:
|
||||
|
||||
#### CUDA:
|
||||
- Add warp sync for `parallel_reduce` to avoid race condition [\#6630](https://github.com/kokkos/kokkos/pull/6630), [\#6746](https://github.com/kokkos/kokkos/pull/6746)
|
||||
|
||||
#### HIP:
|
||||
- Fix Graph "multiple definition of" linking error (missing `inline` specifier) [\#6624](https://github.com/kokkos/kokkos/pull/6624)
|
||||
- Add support for gfx940 (AMD Instinct MI300 GPU) [\#6671](https://github.com/kokkos/kokkos/pull/6671)
|
||||
|
||||
### Build System
|
||||
- CMake: Don't let Kokkos set `CMAKE_CXX_FLAGS` for Trilinos builds [\#6742](https://github.com/kokkos/kokkos/pull/6742)
|
||||
|
||||
### Bug Fixes
|
||||
- Remove deprecation warning for `AllocationMechanism` for GCC <11.0 [\#6653](https://github.com/kokkos/kokkos/pull/6653)
|
||||
- Fix bug early tools finalize with non-default host execution instances [\#6635](https://github.com/kokkos/kokkos/pull/6635)
|
||||
- Fix various issues for MSVC CUDA builds [\#6659](https://github.com/kokkos/kokkos/pull/6659)
|
||||
- Fix "extra `;`" warning with `-pedantic` flag in `<Kokkos_SIMD_Scalar.hpp>` [\#6510](https://github.com/kokkos/kokkos/pull/6510)
|
||||
|
||||
## [4.2.00](https://github.com/kokkos/kokkos/tree/4.2.00) (2023-11-06)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.1.00...4.2.00)
|
||||
|
||||
|
||||
@ -151,7 +151,7 @@ ENDIF()
|
||||
|
||||
set(Kokkos_VERSION_MAJOR 4)
|
||||
set(Kokkos_VERSION_MINOR 2)
|
||||
set(Kokkos_VERSION_PATCH 0)
|
||||
set(Kokkos_VERSION_PATCH 1)
|
||||
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
|
||||
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
|
||||
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
|
||||
@ -252,7 +252,6 @@ ENDIF()
|
||||
# subpackages
|
||||
|
||||
## This restores the old behavior of ProjectCompilerPostConfig.cmake
|
||||
# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos
|
||||
# We must do this before KOKKOS_PACKAGE_DECL
|
||||
IF (KOKKOS_HAS_TRILINOS)
|
||||
# Overwrite the old flags at the top-level
|
||||
@ -280,21 +279,13 @@ IF (KOKKOS_HAS_TRILINOS)
|
||||
SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
|
||||
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
|
||||
ENDFOREACH()
|
||||
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
|
||||
IF (KOKKOS_ENABLE_CUDA)
|
||||
STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}")
|
||||
FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
|
||||
SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}")
|
||||
LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG})
|
||||
ENDFOREACH()
|
||||
SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}")
|
||||
ENDIF()
|
||||
# Both parent scope and this package
|
||||
# In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in
|
||||
# TRILINOS_TOPLEVEL_CXX_FLAGS
|
||||
SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE)
|
||||
SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}")
|
||||
#CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here
|
||||
#These flags get set up in KOKKOS_PACKAGE_DECL, which means they
|
||||
#must be configured before KOKKOS_PACKAGE_DECL
|
||||
SET(KOKKOS_ALL_COMPILE_OPTIONS
|
||||
|
||||
@ -12,7 +12,7 @@ endif
|
||||
|
||||
KOKKOS_VERSION_MAJOR = 4
|
||||
KOKKOS_VERSION_MINOR = 2
|
||||
KOKKOS_VERSION_PATCH = 0
|
||||
KOKKOS_VERSION_PATCH = 1
|
||||
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
|
||||
|
||||
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
|
||||
@ -23,7 +23,7 @@ KOKKOS_DEVICES ?= "OpenMP"
|
||||
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90
|
||||
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
|
||||
# IBM: BGQ,Power7,Power8,Power9
|
||||
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX942,GFX1030,GFX1100
|
||||
# AMD-GPUS: GFX906,GFX908,GFX90A,GFX940,GFX942,GFX1030,GFX1100
|
||||
# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
|
||||
# Intel-GPUs: Gen9,Gen11,Gen12LP,DG1,XeHP,PVC
|
||||
KOKKOS_ARCH ?= ""
|
||||
@ -416,6 +416,8 @@ endif
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA906),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX906))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA908),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX908))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),VEGA90A),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX90A))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942)
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(or $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100),$(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1100))
|
||||
|
||||
@ -1113,6 +1115,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
||||
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
||||
KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1)
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942")
|
||||
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU")
|
||||
|
||||
@ -199,7 +199,8 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) {
|
||||
// this is needed for intel to avoid
|
||||
// error #1011: missing return statement at end of non-void function
|
||||
#if defined KOKKOS_COMPILER_INTEL || \
|
||||
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
|
||||
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \
|
||||
!defined(KOKKOS_COMPILER_MSVC))
|
||||
__builtin_unreachable();
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -139,7 +139,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
||||
numTeams, numCols);
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
|
||||
auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL());
|
||||
auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL());
|
||||
|
||||
@ -191,7 +191,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
|
||||
// -----------------------------------------------
|
||||
auto returnView_h = create_host_space_copy(returnView);
|
||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
|
||||
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
|
||||
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
|
||||
|
||||
@ -240,7 +240,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
|
||||
"stdDestTrueView", numTeams, numCols);
|
||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestFalseView(
|
||||
"stdDestFalseView", numTeams, numCols);
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
|
||||
for (std::size_t i = 0; i < sourceView_dc_h.extent(0); ++i) {
|
||||
auto myRowSource = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL());
|
||||
|
||||
@ -197,7 +197,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId,
|
||||
auto distancesView_h = create_host_space_copy(distancesView);
|
||||
auto dataViewAfterOp_h = create_host_space_copy(dataView);
|
||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
|
||||
for (std::size_t i = 0; i < dataView_dc_h.extent(0); ++i) {
|
||||
auto myRow = Kokkos::subview(dataView_dc_h, i, Kokkos::ALL());
|
||||
|
||||
@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
||||
numTeams, numCols);
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
for (std::size_t i = 0; i < destViewAfterOp_h.extent(0); ++i) {
|
||||
auto rowFrom =
|
||||
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
|
||||
|
||||
@ -127,7 +127,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
||||
// -----------------------------------------------
|
||||
// check against std
|
||||
// -----------------------------------------------
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
auto dataViewAfterOp_h = create_host_space_copy(dataView);
|
||||
auto distancesView_h = create_host_space_copy(distancesView);
|
||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||
|
||||
@ -145,7 +145,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
||||
auto intraTeamSentinelView_h = create_host_space_copy(intraTeamSentinelView);
|
||||
Kokkos::View<ValueType**, Kokkos::HostSpace> stdDestView("stdDestView",
|
||||
numTeams, numCols);
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
for (std::size_t i = 0; i < sourceView.extent(0); ++i) {
|
||||
auto rowFrom =
|
||||
Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL());
|
||||
|
||||
@ -103,7 +103,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) {
|
||||
stdDataView(i, j) = cloneOfDataViewBeforeOp_h(i, j);
|
||||
}
|
||||
}
|
||||
GreaterThanValueFunctor predicate(threshold);
|
||||
GreaterThanValueFunctor<ValueType> predicate(threshold);
|
||||
for (std::size_t i = 0; i < dataView.extent(0); ++i) {
|
||||
auto thisRow = Kokkos::subview(stdDataView, i, Kokkos::ALL());
|
||||
std::replace_if(KE::begin(thisRow), KE::end(thisRow), predicate, newVal);
|
||||
|
||||
@ -114,6 +114,7 @@
|
||||
#cmakedefine KOKKOS_ARCH_AMD_GFX906
|
||||
#cmakedefine KOKKOS_ARCH_AMD_GFX908
|
||||
#cmakedefine KOKKOS_ARCH_AMD_GFX90A
|
||||
#cmakedefine KOKKOS_ARCH_AMD_GFX940
|
||||
#cmakedefine KOKKOS_ARCH_AMD_GFX942
|
||||
#cmakedefine KOKKOS_ARCH_AMD_GFX1030
|
||||
#cmakedefine KOKKOS_ARCH_AMD_GFX1100
|
||||
|
||||
@ -94,9 +94,9 @@ IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR K
|
||||
ENDIF()
|
||||
|
||||
# AMD archs ordered in decreasing priority of autodetection
|
||||
LIST(APPEND SUPPORTED_AMD_GPUS MI300)
|
||||
LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942)
|
||||
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942)
|
||||
LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300)
|
||||
LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940)
|
||||
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940)
|
||||
LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100)
|
||||
LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908)
|
||||
LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908)
|
||||
|
||||
@ -309,6 +309,11 @@ class ParallelReduce<CombinedFunctorReducerType,
|
||||
|
||||
if (CudaTraits::WarpSize < word_count.value) {
|
||||
__syncthreads();
|
||||
} else {
|
||||
// In the above call to final(), shared might have been updated by a
|
||||
// single thread within a warp without synchronization. Synchronize
|
||||
// threads within warp to avoid potential race condition.
|
||||
__syncwarp(0xffffffff);
|
||||
}
|
||||
|
||||
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
||||
|
||||
@ -243,6 +243,12 @@ class ParallelReduce<CombinedFunctorReducerType, Kokkos::RangePolicy<Traits...>,
|
||||
|
||||
if (CudaTraits::WarpSize < word_count.value) {
|
||||
__syncthreads();
|
||||
} else if (word_count.value > 1) {
|
||||
// Inside cuda_single_inter_block_reduce_scan() above, shared[i] below
|
||||
// might have been updated by a single thread within a warp without
|
||||
// synchronization afterwards. Synchronize threads within warp to avoid
|
||||
// potential racecondition.
|
||||
__syncwarp(0xffffffff);
|
||||
}
|
||||
|
||||
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
||||
|
||||
@ -742,6 +742,11 @@ class ParallelReduce<CombinedFunctorReducerType,
|
||||
|
||||
if (CudaTraits::WarpSize < word_count.value) {
|
||||
__syncthreads();
|
||||
} else {
|
||||
// In the above call to final(), shared might have been updated by a
|
||||
// single thread within a warp without synchronization. Synchronize
|
||||
// threads within warp to avoid potential race condition.
|
||||
__syncwarp(0xffffffff);
|
||||
}
|
||||
|
||||
for (unsigned i = threadIdx.y; i < word_count.value; i += blockDim.y) {
|
||||
|
||||
@ -83,7 +83,7 @@ class GraphImpl<Kokkos::HIP> {
|
||||
hipGraphExec_t m_graph_exec = nullptr;
|
||||
};
|
||||
|
||||
GraphImpl<Kokkos::HIP>::~GraphImpl() {
|
||||
inline GraphImpl<Kokkos::HIP>::~GraphImpl() {
|
||||
m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction");
|
||||
KOKKOS_EXPECTS(m_graph);
|
||||
if (m_graph_exec) {
|
||||
@ -92,12 +92,12 @@ GraphImpl<Kokkos::HIP>::~GraphImpl() {
|
||||
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphDestroy(m_graph));
|
||||
}
|
||||
|
||||
GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance)
|
||||
inline GraphImpl<Kokkos::HIP>::GraphImpl(Kokkos::HIP instance)
|
||||
: m_execution_space(std::move(instance)) {
|
||||
KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphCreate(&m_graph, 0));
|
||||
}
|
||||
|
||||
void GraphImpl<Kokkos::HIP>::add_node(
|
||||
inline void GraphImpl<Kokkos::HIP>::add_node(
|
||||
std::shared_ptr<aggregate_node_impl_t> const& arg_node_ptr) {
|
||||
// All of the predecessors are just added as normal, so all we need to
|
||||
// do here is add an empty node
|
||||
@ -110,7 +110,7 @@ void GraphImpl<Kokkos::HIP>::add_node(
|
||||
// Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl
|
||||
// Also requires that the kernel has the graph node tag in it's policy
|
||||
template <class NodeImpl>
|
||||
void GraphImpl<Kokkos::HIP>::add_node(
|
||||
inline void GraphImpl<Kokkos::HIP>::add_node(
|
||||
std::shared_ptr<NodeImpl> const& arg_node_ptr) {
|
||||
static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value);
|
||||
KOKKOS_EXPECTS(arg_node_ptr);
|
||||
@ -129,8 +129,8 @@ void GraphImpl<Kokkos::HIP>::add_node(
|
||||
// already been added to this graph and NodeImpl is a specialization of
|
||||
// GraphNodeImpl that has already been added to this graph.
|
||||
template <class NodeImplPtr, class PredecessorRef>
|
||||
void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr,
|
||||
PredecessorRef arg_pred_ref) {
|
||||
inline void GraphImpl<Kokkos::HIP>::add_predecessor(
|
||||
NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) {
|
||||
KOKKOS_EXPECTS(arg_node_ptr);
|
||||
auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref);
|
||||
KOKKOS_EXPECTS(pred_ptr);
|
||||
@ -145,7 +145,7 @@ void GraphImpl<Kokkos::HIP>::add_predecessor(NodeImplPtr arg_node_ptr,
|
||||
hipGraphAddDependencies(m_graph, &pred_node, &node, 1));
|
||||
}
|
||||
|
||||
void GraphImpl<Kokkos::HIP>::submit() {
|
||||
inline void GraphImpl<Kokkos::HIP>::submit() {
|
||||
if (!m_graph_exec) {
|
||||
instantiate_graph();
|
||||
}
|
||||
@ -153,12 +153,12 @@ void GraphImpl<Kokkos::HIP>::submit() {
|
||||
hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream()));
|
||||
}
|
||||
|
||||
Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const
|
||||
inline Kokkos::HIP const& GraphImpl<Kokkos::HIP>::get_execution_space() const
|
||||
noexcept {
|
||||
return m_execution_space;
|
||||
}
|
||||
|
||||
auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
|
||||
inline auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
|
||||
KOKKOS_EXPECTS(m_graph);
|
||||
KOKKOS_EXPECTS(!m_graph_exec);
|
||||
auto rv = std::make_shared<root_node_impl_t>(get_execution_space(),
|
||||
@ -172,7 +172,7 @@ auto GraphImpl<Kokkos::HIP>::create_root_node_ptr() {
|
||||
}
|
||||
|
||||
template <class... PredecessorRefs>
|
||||
auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) {
|
||||
inline auto GraphImpl<Kokkos::HIP>::create_aggregate_ptr(PredecessorRefs&&...) {
|
||||
// The attachment to predecessors, which is all we really need, happens
|
||||
// in the generic layer, which calls through to add_predecessor for
|
||||
// each predecessor ref, so all we need to do here is create the (trivial)
|
||||
|
||||
@ -30,7 +30,8 @@ namespace Impl {
|
||||
|
||||
struct HIPTraits {
|
||||
#if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \
|
||||
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX942)
|
||||
defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \
|
||||
defined(KOKKOS_ARCH_AMD_GFX942)
|
||||
static constexpr int WarpSize = 64;
|
||||
static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */
|
||||
static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/
|
||||
|
||||
@ -75,7 +75,14 @@ class HostSpace {
|
||||
/**\brief Non-default memory space instance to choose allocation mechansim,
|
||||
* if available */
|
||||
|
||||
enum KOKKOS_DEPRECATED AllocationMechanism {
|
||||
#if defined(KOKKOS_COMPILER_GNU) && KOKKOS_COMPILER_GNU < 1100
|
||||
// We see deprecation warnings even when not using the deprecated
|
||||
// HostSpace constructor below when using gcc before release 11.
|
||||
enum
|
||||
#else
|
||||
enum KOKKOS_DEPRECATED
|
||||
#endif
|
||||
AllocationMechanism {
|
||||
STD_MALLOC,
|
||||
POSIX_MEMALIGN,
|
||||
POSIX_MMAP,
|
||||
|
||||
@ -31,7 +31,7 @@ namespace Kokkos {
|
||||
// backends. The GPU backends always return 1 and NVHPC only compiles if we
|
||||
// don't ask for the return value.
|
||||
template <typename... Args>
|
||||
KOKKOS_FORCEINLINE_FUNCTION void printf(const char* format, Args... args) {
|
||||
KOKKOS_FUNCTION void printf(const char* format, Args... args) {
|
||||
#ifdef KOKKOS_ENABLE_SYCL
|
||||
// Some compilers warn if "args" is empty and format is not a string literal
|
||||
if constexpr (sizeof...(Args) == 0)
|
||||
|
||||
@ -359,8 +359,6 @@ void OpenMPInternal::finalize() {
|
||||
}
|
||||
|
||||
m_initialized = false;
|
||||
|
||||
Kokkos::Profiling::finalize();
|
||||
}
|
||||
|
||||
void OpenMPInternal::print_configuration(std::ostream &s) const {
|
||||
|
||||
@ -219,6 +219,8 @@ KOKKOS_DEPRECATED void OpenMP::partition_master(F const& f, int num_partitions,
|
||||
Exec::validate_partition_impl(prev_instance->m_pool_size, num_partitions,
|
||||
partition_size);
|
||||
|
||||
OpenMP::memory_space space;
|
||||
|
||||
#pragma omp parallel num_threads(num_partitions)
|
||||
{
|
||||
Exec thread_local_instance(partition_size);
|
||||
|
||||
@ -58,8 +58,6 @@ void SerialInternal::finalize() {
|
||||
m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
Kokkos::Profiling::finalize();
|
||||
|
||||
m_is_initialized = false;
|
||||
}
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ static_assert(false,
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <iterator>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
@ -815,8 +815,6 @@ void ThreadsExec::finalize() {
|
||||
s_threads_process.m_pool_size = 1;
|
||||
s_threads_process.m_pool_fan_size = 0;
|
||||
s_threads_process.m_pool_state = ThreadsExec::Inactive;
|
||||
|
||||
Kokkos::Profiling::finalize();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -31,7 +31,8 @@
|
||||
#endif
|
||||
|
||||
#if defined KOKKOS_COMPILER_INTEL || \
|
||||
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130)
|
||||
(defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \
|
||||
!defined(KOKKOS_COMPILER_MSVC))
|
||||
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE __builtin_unreachable();
|
||||
#else
|
||||
#define MATHEMATICAL_FUNCTIONS_TEST_UNREACHABLE
|
||||
@ -394,10 +395,12 @@ DEFINE_UNARY_FUNCTION_EVAL(log2, 2);
|
||||
DEFINE_UNARY_FUNCTION_EVAL(log1p, 2);
|
||||
#endif
|
||||
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||
DEFINE_UNARY_FUNCTION_EVAL(sqrt, 2);
|
||||
DEFINE_UNARY_FUNCTION_EVAL(cbrt, 2);
|
||||
#endif
|
||||
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
||||
DEFINE_UNARY_FUNCTION_EVAL(sin, 2);
|
||||
DEFINE_UNARY_FUNCTION_EVAL(cos, 2);
|
||||
DEFINE_UNARY_FUNCTION_EVAL(tan, 2);
|
||||
@ -483,11 +486,9 @@ DEFINE_UNARY_FUNCTION_EVAL(logb, 2);
|
||||
}; \
|
||||
constexpr char math_function_name<MathBinaryFunction_##FUNC>::name[]
|
||||
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||
DEFINE_BINARY_FUNCTION_EVAL(pow, 2);
|
||||
DEFINE_BINARY_FUNCTION_EVAL(hypot, 2);
|
||||
#endif
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||
DEFINE_BINARY_FUNCTION_EVAL(nextafter, 1);
|
||||
DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
|
||||
#endif
|
||||
@ -519,7 +520,7 @@ DEFINE_BINARY_FUNCTION_EVAL(copysign, 1);
|
||||
}; \
|
||||
constexpr char math_function_name<MathTernaryFunction_##FUNC>::name[]
|
||||
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_1
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||
DEFINE_TERNARY_FUNCTION_EVAL(hypot, 2);
|
||||
DEFINE_TERNARY_FUNCTION_EVAL(fma, 2);
|
||||
#endif
|
||||
@ -787,7 +788,9 @@ TEST(TEST_CATEGORY, mathematical_functions_trigonometric_functions) {
|
||||
|
||||
// TODO atan2
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||
TEST(TEST_CATEGORY, mathematical_functions_power_functions) {
|
||||
TEST_MATH_FUNCTION(sqrt)({0, 1, 2, 3, 5, 7, 11});
|
||||
TEST_MATH_FUNCTION(sqrt)({0l, 1l, 2l, 3l, 5l, 7l, 11l});
|
||||
@ -1568,6 +1571,7 @@ TEST(TEST_CATEGORY, mathematical_functions_ieee_remainder_function) {
|
||||
|
||||
// TODO: TestFpClassify, see https://github.com/kokkos/kokkos/issues/6279
|
||||
|
||||
#ifndef KOKKOS_MATHEMATICAL_FUNCTIONS_SKIP_2
|
||||
template <class Space>
|
||||
struct TestIsFinite {
|
||||
TestIsFinite() { run(); }
|
||||
@ -1591,6 +1595,7 @@ struct TestIsFinite {
|
||||
++e;
|
||||
Kokkos::printf("failed isfinite(float)\n");
|
||||
}
|
||||
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||
if (!isfinite(static_cast<KE::half_t>(2.f))
|
||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||
|| isfinite(quiet_NaN<KE::half_t>::value) ||
|
||||
@ -1611,6 +1616,7 @@ struct TestIsFinite {
|
||||
++e;
|
||||
Kokkos::printf("failed isfinite(KE::bhalf_t)\n");
|
||||
}
|
||||
#endif
|
||||
if (!isfinite(3.)
|
||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||
|| isfinite(quiet_NaN<double>::value) ||
|
||||
@ -1670,6 +1676,7 @@ struct TestIsInf {
|
||||
++e;
|
||||
Kokkos::printf("failed isinf(float)\n");
|
||||
}
|
||||
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||
if (isinf(static_cast<KE::half_t>(2.f))
|
||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||
|| isinf(quiet_NaN<KE::half_t>::value) ||
|
||||
@ -1690,6 +1697,7 @@ struct TestIsInf {
|
||||
++e;
|
||||
Kokkos::printf("failed isinf(KE::bhalf_t)\n");
|
||||
}
|
||||
#endif
|
||||
if (isinf(3.)
|
||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||
|| isinf(quiet_NaN<double>::value) ||
|
||||
@ -1748,6 +1756,7 @@ struct TestIsNaN {
|
||||
++e;
|
||||
Kokkos::printf("failed isnan(float)\n");
|
||||
}
|
||||
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||
if (isnan(static_cast<KE::half_t>(2.f))
|
||||
#ifndef KOKKOS_COMPILER_NVHPC // FIXME_NVHPC 23.7
|
||||
|| !isnan(quiet_NaN<KE::half_t>::value) ||
|
||||
@ -1777,6 +1786,7 @@ struct TestIsNaN {
|
||||
++e;
|
||||
Kokkos::printf("failed isnan(double)\n");
|
||||
}
|
||||
#endif
|
||||
#ifdef MATHEMATICAL_FUNCTIONS_HAVE_LONG_DOUBLE_OVERLOADS
|
||||
if (isnan(4.l) || !isnan(quiet_NaN<long double>::value) ||
|
||||
!isnan(signaling_NaN<long double>::value) ||
|
||||
@ -1803,6 +1813,7 @@ struct TestIsNaN {
|
||||
TEST(TEST_CATEGORY, mathematical_functions_isnan) {
|
||||
TestIsNaN<TEST_EXECSPACE>();
|
||||
}
|
||||
#endif
|
||||
|
||||
// TODO: TestSignBit, see https://github.com/kokkos/kokkos/issues/6279
|
||||
#endif
|
||||
|
||||
@ -110,8 +110,8 @@ struct TestNumericTraits {
|
||||
|
||||
KOKKOS_FUNCTION void operator()(Epsilon, int, int& e) const {
|
||||
using Kokkos::Experimental::epsilon;
|
||||
auto const eps = epsilon<T>::value;
|
||||
auto const one = T(1);
|
||||
T const eps = epsilon<T>::value;
|
||||
T const one = 1;
|
||||
// Avoid higher precision intermediate representation
|
||||
compare() = one + eps;
|
||||
e += (int)!(compare() != one);
|
||||
|
||||
@ -160,6 +160,7 @@ display_help_text() {
|
||||
echo " AMD_GFX906 = AMD GPU MI50/MI60 GFX906"
|
||||
echo " AMD_GFX908 = AMD GPU MI100 GFX908"
|
||||
echo " AMD_GFX90A = AMD GPU MI200 GFX90A"
|
||||
echo " AMD_GFX940 = AMD GPU MI300 GFX940"
|
||||
echo " AMD_GFX942 = AMD GPU MI300 GFX942"
|
||||
echo " AMD_GFX1030 = AMD GPU V620/W6800 GFX1030"
|
||||
echo " AMD_GFX1100 = AMD GPU RX 7900 XT(X) GFX1100"
|
||||
|
||||
@ -34,3 +34,4 @@ tag: 4.0.00 date: 02:23:2023 master: 5ad60966 release: 52ea2953
|
||||
tag: 4.0.01 date: 04:26:2023 master: aa1f48f3 release: 5893754f
|
||||
tag: 4.1.00 date: 06:20:2023 master: 62d2b6c8 release: adde1e6a
|
||||
tag: 4.2.00 date: 11:09:2023 master: 1a3ea28f release: abe01c88
|
||||
tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a
|
||||
|
||||
@ -224,7 +224,7 @@ template <typename T>
|
||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||
Kokkos::floor(static_cast<data_type>(a[0])));
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto ceil(
|
||||
@ -232,7 +232,7 @@ template <typename T>
|
||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||
Kokkos::ceil(static_cast<data_type>(a[0])));
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto round(
|
||||
@ -240,7 +240,7 @@ template <typename T>
|
||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||
Experimental::round_half_to_nearest_even(static_cast<data_type>(a[0])));
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION auto trunc(
|
||||
@ -248,7 +248,7 @@ template <typename T>
|
||||
using data_type = std::conditional_t<std::is_floating_point_v<T>, T, double>;
|
||||
return Experimental::simd<data_type, Experimental::simd_abi::scalar>(
|
||||
Kokkos::trunc(static_cast<data_type>(a[0])));
|
||||
};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
[[nodiscard]] KOKKOS_FORCEINLINE_FUNCTION
|
||||
|
||||
@ -42,6 +42,7 @@ inline void host_check_gen_ctor() {
|
||||
simd_type blend;
|
||||
blend.copy_from(expected, Kokkos::Experimental::element_aligned_tag());
|
||||
|
||||
#if !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC))
|
||||
if constexpr (std::is_same_v<Abi, Kokkos::Experimental::simd_abi::scalar>) {
|
||||
simd_type basic(KOKKOS_LAMBDA(std::size_t i) { return init[i]; });
|
||||
host_check_equality(basic, rhs, lanes);
|
||||
@ -63,6 +64,7 @@ inline void host_check_gen_ctor() {
|
||||
|
||||
host_check_equality(blend, result, lanes);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Abi, typename... DataTypes>
|
||||
|
||||
Reference in New Issue
Block a user