Files
lammps/lib/kokkos/core/unit_test/TestGraph.hpp
2024-12-13 09:23:03 -07:00

743 lines
27 KiB
C++

//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#include <Kokkos_Core.hpp>
#include <Kokkos_Graph.hpp>
#include <gtest/gtest.h>
#include <tools/include/ToolTestingUtilities.hpp>
namespace Test {
template <class ExecSpace, class ValueType>
struct NoOpReduceFunctor {
  // Reduction functor whose call operators must never be reached: it is only
  // attached to reductions whose policies contain zero work items, so any
  // invocation indicates a dispatch bug.
  KOKKOS_FUNCTION void operator()(
      const typename Kokkos::TeamPolicy<ExecSpace>::member_type&,
      ValueType&) const {
    Kokkos::abort("Should never be called!");
  }
  KOKKOS_FUNCTION void operator()(int, ValueType&) const {
    Kokkos::abort("Should never be called!");
  }
  KOKKOS_FUNCTION void operator()(int, int, ValueType&) const {
    Kokkos::abort("Should never be called!");
  }
};
template <class ExecSpace>
struct CountTestFunctor {
  using value_type = int;
  template <class T>
  using atomic_view =
      Kokkos::View<T, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>>;
  // Shared counter incremented once per invocation.
  atomic_view<int> count;
  // Incremented whenever the counter is observed outside the expected window.
  atomic_view<int> bugs;
  int expected_count_min;
  int expected_count_max;
  // Accepts (and ignores) whatever index arguments the launching policy
  // passes, so the same functor works with every policy flavor.
  template <class... Ts>
  KOKKOS_FUNCTION void operator()(Ts&&...) const noexcept {
    const bool in_range =
        count() >= expected_count_min && count() <= expected_count_max;
    bugs() += in_range ? 0 : 1;
    count()++;
  }
};
template <class ExecSpace, class T>
struct SetViewToValueFunctor {
  using value_type = T;
  using view_type =
      Kokkos::View<T, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>>;
  view_type v;  // rank-0 destination
  T value;      // value written on every invocation
  // Ignores all policy-provided arguments and stores `value` into `v`.
  template <class... Args>
  KOKKOS_FUNCTION void operator()(Args&&...) const noexcept {
    v() = value;
  }
};
template <class ExecSpace, class T>
struct SetResultToViewFunctor {
  using value_type = T;
  using view_type =
      Kokkos::View<T, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>>;
  view_type v;  // rank-0 source folded into the reduction result
  // Adds the current content of `v` into the per-thread reduction
  // contribution; the work index is ignored.
  template <class Index>
  KOKKOS_FUNCTION void operator()(Index&&, value_type& update) const noexcept {
    update += v();
  }
};
// Shared fixture for all graph tests of this category: device-side counters,
// host mirrors for reduction targets, and the execution space instance on
// which graphs are created.
struct TEST_CATEGORY_FIXTURE(graph) : public ::testing::Test {
 public:
  using count_functor = CountTestFunctor<TEST_EXECSPACE>;
  using set_functor = SetViewToValueFunctor<TEST_EXECSPACE, int>;
  using set_result_functor = SetResultToViewFunctor<TEST_EXECSPACE, int>;
  using view_type = Kokkos::View<int, TEST_EXECSPACE>;
  using atomic_view_type = typename count_functor::template atomic_view<int>;
  using view_host = Kokkos::View<int, Kokkos::HostSpace>;
  // Device-side counters shared by the test functors.
  atomic_view_type count{"count"};
  atomic_view_type bugs{"bugs"};
  // Host views used as reduction targets in some tests.
  view_host count_host{"count_host"};
  view_host bugs_host{"bugs_host"};
  // Execution space instance handed to create_graph in most tests.
  TEST_EXECSPACE ex{};

 protected:
  // Zero both counters before each test; the fence guarantees the
  // asynchronous deep copies have completed before the test body runs.
  void SetUp() override {
    Kokkos::deep_copy(ex, count, 0);
    Kokkos::deep_copy(ex, bugs, 0);
    ex.fence();
  }
};
// Check if a rank-0 view contains a given value.
template <typename Exec, typename ViewType>
::testing::AssertionResult contains(
    const Exec& exec, const ViewType& view,
    const typename ViewType::value_type& expected) {
  static_assert(ViewType::rank() == 0);
  // Copy the (possibly device-resident) value to the host and wait for the
  // asynchronous copy to complete before comparing.
  typename ViewType::non_const_value_type value;
  Kokkos::deep_copy(exec, value, view);
  exec.fence();
  if (value == expected) return ::testing::AssertionSuccess();
  return ::testing::AssertionFailure()
         << expected << " is not in " << view.label() << ", got " << value;
}
TEST_F(TEST_CATEGORY_FIXTURE(graph), submit_once) {
  // A one-node graph created on a default-constructed execution space must
  // run its kernel exactly once per submission.
  auto graph =
      Kokkos::Experimental::create_graph<TEST_EXECSPACE>([&](auto root) {
        root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
      });
  graph.submit();
  const auto& exec = graph.get_execution_space();
  ASSERT_TRUE(contains(exec, count, 1));
  ASSERT_TRUE(contains(exec, bugs, 0));
}
TEST_F(TEST_CATEGORY_FIXTURE(graph), submit_once_rvalue) {
  // Submit directly on the temporary returned by create_graph, i.e. without
  // ever binding the graph to a named variable.
  const auto build = [&](auto root) {
    root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
  };
  Kokkos::Experimental::create_graph(ex, build).submit();
  ASSERT_TRUE(contains(ex, count, 1));
  ASSERT_TRUE(contains(ex, bugs, 0));
}
// Ensure that Kokkos::Graph::instantiate works.
// For now, Kokkos::Graph::submit will instantiate if needed,
// so this test is not very strong.
TEST_F(TEST_CATEGORY_FIXTURE(graph), instantiate_and_submit_once) {
  auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) {
    root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
  });
  // Instantiate explicitly before submitting; submit() would otherwise
  // instantiate on demand (see the comment above this test).
  graph.instantiate();
  graph.submit();
  ASSERT_TRUE(contains(ex, count, 1));
  ASSERT_TRUE(contains(ex, bugs, 0));
}
// FIXME death tests and fixtures
// gtest requires death-test suites to be named "*DeathTest"; these helpers
// build such a suite name from the test category.
#define TEST_CATEGORY_FIXTURE_DEATH_HELPER(category, name) \
  category##_##name##_DeathTest
// Extra expansion level so TEST_CATEGORY is macro-expanded before pasting.
#define TEST_CATEGORY_FIXTURE_DEATH_HELPER_EXPAND(category, name) \
  TEST_CATEGORY_FIXTURE_DEATH_HELPER(category, name)
#define TEST_CATEGORY_FIXTURE_DEATH(name) \
  TEST_CATEGORY_FIXTURE_DEATH_HELPER_EXPAND(TEST_CATEGORY, name)
// Death-test fixture reusing the regular graph fixture's members.
struct TEST_CATEGORY_FIXTURE_DEATH(graph)
    : public TEST_CATEGORY_FIXTURE(graph) {};
// Ensure that Kokkos::Graph::instantiate can be called only once.
// This test checks 2 cases:
// 1. Instantiating after submission is invalid (this also implicitly
// checks that submission instantiates if need be).
// 2. Instantiating twice in a row is invalid.
TEST_F(TEST_CATEGORY_FIXTURE_DEATH(graph), can_instantiate_only_once) {
  ::testing::FLAGS_gtest_death_test_style = "threadsafe";
  {
    // Skip when KOKKOS_ASSERT is compiled out (the precondition failure the
    // ASSERT_DEATHs below rely on would then never trigger).
    bool checked_assertions = false;
    KOKKOS_ASSERT(checked_assertions = true);
    if (!checked_assertions) {
      GTEST_SKIP() << "Preconditions are not checked.";
    }
  }
  {
    // Case 1: submit() instantiates on demand, so a subsequent explicit
    // instantiate() must abort.
    auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) {
      root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
    });
    graph.submit();
    ASSERT_DEATH(graph.instantiate(),
                 "Expected precondition `.*` evaluated false.");
  }
  {
    // Case 2: calling instantiate() twice in a row must abort.
    auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) {
      root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
    });
    graph.instantiate();
    ASSERT_DEATH(graph.instantiate(),
                 "Expected precondition `.*` evaluated false.");
  }
}
// This test submits on an execution space instance different from the
// one passed to the Kokkos::Graph constructor.
TEST_F(TEST_CATEGORY_FIXTURE(graph),
       submit_onto_another_execution_space_instance) {
#ifdef KOKKOS_ENABLE_OPENMP  // FIXME_OPENMP partition_space
  if (ex.concurrency() < 2)
    GTEST_SKIP() << "insufficient number of supported concurrent threads";
#endif
  // Create the graph on instance 0 but submit it on instance 1.
  const auto execution_space_instances =
      Kokkos::Experimental::partition_space(ex, 1, 1);
  auto graph = Kokkos::Experimental::create_graph(
      execution_space_instances.at(0), [&](auto root) {
        root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
      });
  graph.instantiate();
  // Instantiation may enqueue work on the creating instance; make sure it
  // finished before submitting on a different instance.
  execution_space_instances.at(0).fence(
      "The graph might make async copies to device.");
  graph.submit(execution_space_instances.at(1));
  ASSERT_TRUE(contains(execution_space_instances.at(1), count, 1));
  ASSERT_TRUE(contains(execution_space_instances.at(1), bugs, 0));
}
// This test ensures that it's possible to build a Kokkos::Graph using
// Kokkos::Experimental::create_graph without providing a closure, but giving an
// execution space instance.
TEST_F(TEST_CATEGORY_FIXTURE(graph), create_graph_no_closure_with_exec) {
  auto graph = Kokkos::Experimental::create_graph(ex);
  // Without a closure, the root node is obtained through the Impl access
  // helper and nodes are added imperatively.
  auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph);
  auto node = root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
  graph.submit(ex);
  ASSERT_TRUE(contains(ex, count, 1));
  ASSERT_TRUE(contains(ex, bugs, 0));
}
// This test ensures that it's possible to build a Kokkos::Graph using
// Kokkos::Experimental::create_graph without any argument.
// The test has to be skipped if the test fixture is
// not instantiated for the default execution space.
TEST_F(TEST_CATEGORY_FIXTURE(graph), create_graph_no_arg) {
  if constexpr (!std::is_same_v<TEST_EXECSPACE,
                                Kokkos::DefaultExecutionSpace>) {
    GTEST_SKIP() << "Skipping since useless if the test fixture is not on the "
                    "default execution space.";
  }
  // With no argument, create_graph must pick the default execution space.
  auto graph = Kokkos::Experimental::create_graph();
  static_assert(std::is_same_v<typename decltype(graph)::execution_space,
                               Kokkos::DefaultExecutionSpace>);
  auto root = Kokkos::Impl::GraphAccess::create_root_ref(graph);
  auto node = root.then_parallel_for(1, count_functor{count, bugs, 0, 0});
  graph.submit(graph.get_execution_space());
  ASSERT_TRUE(contains(graph.get_execution_space(), count, 1));
  ASSERT_TRUE(contains(graph.get_execution_space(), bugs, 0));
}
TEST_F(TEST_CATEGORY_FIXTURE(graph), submit_six) {
#ifdef KOKKOS_ENABLE_OPENMPTARGET  // FIXME_OPENMPTARGET team_size incompatible
  if (std::is_same_v<TEST_EXECSPACE, Kokkos::Experimental::OpenMPTarget>)
    GTEST_SKIP() << "skipping since OpenMPTarget can't use team_size 1";
#endif
#if defined(KOKKOS_ENABLE_SYCL) && \
    !defined(SYCL_EXT_ONEAPI_GRAPH)  // FIXME_SYCL
  if (std::is_same_v<TEST_EXECSPACE, Kokkos::SYCL>)
    GTEST_SKIP() << "skipping since test case is known to fail with SYCL";
#endif
  // Two setup nodes zero the counters; then five sibling nodes (the last one
  // iterating twice) each increment `count`, for six increments total.
  auto graph = Kokkos::Experimental::create_graph(ex, [&](auto root) {
    auto f_setup_count = root.then_parallel_for(1, set_functor{count, 0});
    auto f_setup_bugs = root.then_parallel_for(1, set_functor{bugs, 0});
    //----------------------------------------
    auto ready = Kokkos::Experimental::when_all(f_setup_count, f_setup_bugs);
    //----------------------------------------
    // One node per policy flavor: implicit range, RangePolicy, MDRangePolicy,
    // TeamPolicy, and a two-iteration implicit range.
    ready.then_parallel_for(1, count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(Kokkos::RangePolicy<TEST_EXECSPACE>{0, 1},
                            count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(
        Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{{0, 0}, {1, 1}},
        count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(Kokkos::TeamPolicy<TEST_EXECSPACE>{1, 1},
                            count_functor{count, bugs, 0, 6});
    //----------------------------------------
    ready.then_parallel_for(2, count_functor{count, bugs, 0, 6});
    //----------------------------------------
  });
  graph.submit();
  ASSERT_TRUE(contains(ex, count, 6));
  ASSERT_TRUE(contains(ex, bugs, 0));
}
TEST_F(TEST_CATEGORY_FIXTURE(graph), when_all_cycle) {
  view_type reduction_out{"reduction_out"};
  view_host reduction_host{"reduction_host"};
  Kokkos::Experimental::create_graph(ex, [&](auto root) {
    //----------------------------------------
    // Test when_all when redundant dependencies are given
    // (f2 is already an ancestor of f3, f1 of f4, etc.).
    auto f1 = root.then_parallel_for(1, set_functor{count, 0});
    auto f2 = f1.then_parallel_for(1, count_functor{count, bugs, 0, 0});
    auto f3 = f2.then_parallel_for(5, count_functor{count, bugs, 1, 5});
    auto f4 = Kokkos::Experimental::when_all(f2, f3).then_parallel_for(
        1, count_functor{count, bugs, 6, 6});
    // Once f4 ran, count is 7; reducing it over 6 iterations yields 42.
    Kokkos::Experimental::when_all(f1, f4, f3)
        .then_parallel_reduce(6, set_result_functor{count}, reduction_out);
    //----------------------------------------
  }).submit();
  ASSERT_TRUE(contains(ex, bugs, 0));
  ASSERT_TRUE(contains(ex, count, 7));
  ASSERT_TRUE(contains(ex, reduction_out, 42));
  //----------------------------------------
}
// This test requires that the graph execution space can access
// the host memory space because we don't currently support copying to host,
// even asynchronously. We _may_ want to do that eventually?
TEST_F(TEST_CATEGORY_FIXTURE(graph), repeat_chain) {
  constexpr bool result_not_accessible_by_exec = !Kokkos::SpaceAccessibility<
      TEST_EXECSPACE, decltype(bugs_host)::memory_space>::accessible;
  if constexpr (result_not_accessible_by_exec) {
    GTEST_SKIP() << "The graph requires the reduction targets like 'bugs_host' "
                    "to be accessible by the execution space.";
  } else {
    // Build a linear chain ending in two host-space reductions, then submit
    // the same graph several times and expect identical results each time.
    auto graph = Kokkos::Experimental::create_graph(ex, [&, count_host =
                                                              count_host](
                                                            auto root) {
      // FIXME_CLANG Recent clang versions would still trigger a similar
      // static_assert without the additional if constexpr
      constexpr bool result_not_accessible_by_exec_copy =
          !Kokkos::SpaceAccessibility<
              TEST_EXECSPACE, decltype(bugs_host)::memory_space>::accessible;
      if constexpr (!result_not_accessible_by_exec_copy) {
        //----------------------------------------
        root.then_parallel_for(1, set_functor{count, 0})
            .then_parallel_for(1, count_functor{count, bugs, 0, 0})
            .then_parallel_for(1, count_functor{count, bugs, 1, 1})
            .then_parallel_reduce(1, set_result_functor{count}, count_host)
            .then_parallel_reduce(
                1, set_result_functor{bugs},
                Kokkos::Sum<int, Kokkos::HostSpace>{bugs_host});
        //----------------------------------------
      }
    });
    //----------------------------------------
    // Re-submitting the instantiated graph must be repeatable without
    // rebuilding it.
    constexpr int repeats = 10;
    for (int i = 0; i < repeats; ++i) {
      graph.submit();
      ex.fence();
      EXPECT_EQ(2, count_host());
      EXPECT_EQ(0, bugs_host());
    }
    //----------------------------------------
  }
}
TEST_F(TEST_CATEGORY_FIXTURE(graph), zero_work_reduce) {
  auto graph = Kokkos::Experimental::create_graph(
      ex, [&](Kokkos::Experimental::GraphNodeRef<TEST_EXECSPACE> root) {
        // The functor aborts if ever invoked; all policies below describe
        // zero work items.
        NoOpReduceFunctor<TEST_EXECSPACE, int> no_op_functor;
        root.then_parallel_reduce(Kokkos::RangePolicy<TEST_EXECSPACE>(0, 0),
                                  no_op_functor, count)
#if !defined(KOKKOS_ENABLE_SYCL) || \
    defined(SYCL_EXT_ONEAPI_GRAPH)  // FIXME_SYCL
#if !defined(KOKKOS_ENABLE_CUDA) && \
    !defined(KOKKOS_ENABLE_HIP)  // FIXME_CUDA FIXME_HIP
            .then_parallel_reduce(
                Kokkos::MDRangePolicy<TEST_EXECSPACE, Kokkos::Rank<2>>{{0, 0},
                                                                       {0, 0}},
                no_op_functor, count)
#endif
            .then_parallel_reduce(
                Kokkos::TeamPolicy<TEST_EXECSPACE>{0, Kokkos::AUTO},
                no_op_functor, count)
#endif
        ;
      });
// These fences are only necessary because of the weirdness of how CUDA
// UVM works on pre pascal cards.
#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \
    (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL))
  Kokkos::fence();
#endif
  graph.submit();
  // Poison the result between submissions: a zero-work reduction must still
  // overwrite it with the reduction identity (0) on the second submit.
  Kokkos::deep_copy(ex, count, 1);
// These fences are only necessary because of the weirdness of how CUDA
// UVM works on pre pascal cards.
#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_UVM) && \
    (defined(KOKKOS_ARCH_KEPLER) || defined(KOKKOS_ARCH_MAXWELL))
  if constexpr (std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) Kokkos::fence();
#endif
  graph.submit();
  ASSERT_TRUE(contains(ex, count, 0));
}
// Ensure that an empty graph can be submitted.
TEST_F(TEST_CATEGORY_FIXTURE(graph), empty_graph) {
  // A closure that adds no nodes yields an empty graph, which must still
  // instantiate and submit cleanly.
  const auto empty_closure = [](auto /*root*/) {};
  auto graph = Kokkos::Experimental::create_graph(ex, empty_closure);
  graph.instantiate();
  graph.submit(ex);
  ex.fence();
}
// Functor deliberately sized larger than the backend's constant-memory
// capacity so that launching it must take the global-memory path.
template <typename ViewType>
struct ForceGlobalLaunchFunctor {
 public:
  // One byte more than the constant memory budget of the enabled backend
  // (1 when neither CUDA nor HIP is enabled).
  static constexpr size_t count =
#if defined(KOKKOS_ENABLE_CUDA)
      Kokkos::Impl::CudaTraits::ConstantMemoryUsage +
#elif defined(KOKKOS_ENABLE_HIP)
      Kokkos::Impl::HIPTraits::ConstantMemoryUsage +
#endif
      1;
  ViewType data;
  // explicit: a view should never silently convert into this (huge) functor.
  explicit ForceGlobalLaunchFunctor(ViewType data_) : data(std::move(data_)) {}
  // Increment the rank-0 view once per work item.
  template <typename T>
  KOKKOS_FUNCTION void operator()(const T) const {
    ++data();
  }

 private:
  // Padding that inflates sizeof(*this) beyond the constant-memory limit.
  std::byte unused[count] = {};
};
// Ensure that "global memory launch" path works.
TEST_F(TEST_CATEGORY_FIXTURE(graph), force_global_launch) {
// Only CUDA (and HIP with the native graph backend) have the global-launch
// path; on any other configuration the test is skipped. Note that the
// skip-guard braces are themselves conditionally compiled.
#if defined(KOKKOS_ENABLE_CUDA)
  if constexpr (!std::is_same_v<TEST_EXECSPACE, Kokkos::Cuda>) {
#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH)
  if constexpr (!std::is_same_v<TEST_EXECSPACE, Kokkos::HIP>) {
#endif
    GTEST_SKIP() << "This execution space does not support global launch.";
#if defined(KOKKOS_ENABLE_CUDA) || \
    (defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH))
  }
  using value_t = int;
  using view_t = Kokkos::View<value_t, TEST_EXECSPACE,
                              Kokkos::MemoryTraits<Kokkos::Atomic>>;
  using functor_t = ForceGlobalLaunchFunctor<view_t>;
  const std::string kernel_name = "Let's make it a huge kernel";
  // Label the tools runtime is expected to report for the node's global
  // memory functor storage allocation.
  const std::string alloc_label =
      kernel_name + " - GraphNodeKernel global memory functor storage";
  view_t data(Kokkos::view_alloc("witness", ex));
  using namespace Kokkos::Test::Tools;
  listen_tool_events(Config::DisableAll(), Config::EnableAllocs());
  std::optional<Kokkos::Experimental::Graph<TEST_EXECSPACE>> graph;
  const void* ptr = nullptr;
  uint64_t ptr_size = 0;
  // Building the graph must allocate global-memory storage for the oversized
  // functor; capture the pointer/size to match the deallocation later.
  ASSERT_TRUE(validate_existence(
      [&]() {
        graph = Kokkos::Experimental::create_graph(ex, [&](const auto& root) {
          auto node = root.then_parallel_for(
              kernel_name,
              Kokkos::Experimental::require(
                  Kokkos::RangePolicy<TEST_EXECSPACE>(0, functor_t::count),
                  Kokkos::Experimental::WorkItemProperty::HintHeavyWeight),
              functor_t(data));
        });
      },
      [&](AllocateDataEvent alloc) {
        if (alloc.name != alloc_label)
          return MatchDiagnostic{
              false, {"Allocation name mismatch (got " + alloc.name + ')'}};
        if (alloc.size < functor_t::count)
          return MatchDiagnostic{
              false,
              {"Allocation size mismatch (expected at least " +
               std::to_string(functor_t::count) + " but got " +
               std::to_string(alloc.size) + ')'}};
        ptr = alloc.ptr;
        ptr_size = alloc.size;
        return MatchDiagnostic{true};
      }));
  graph->instantiate();
  // Fencing the default execution space instance, as the node policy
  // was created without giving an instance (it used the default one).
  TEST_EXECSPACE{}.fence(
      "Ensure that kernel dispatch to global memory is finished "
      "before submission.");
  graph->submit(ex);
  ASSERT_TRUE(contains(ex, data, functor_t::count));
  // Destroying the graph must release exactly the storage captured above.
  ASSERT_TRUE(validate_event_set(
      [&]() { graph.reset(); },
      [&](DeallocateDataEvent dealloc) {
        if (dealloc.name == alloc_label && dealloc.ptr == ptr &&
            dealloc.size == ptr_size)
          return MatchDiagnostic{true};
        return MatchDiagnostic{
            false, {"Either the name or pointer or size did not match"}};
      }));
  listen_tool_events(Config::DisableAll());
#endif
}
// Ensure that an empty graph on the default host execution space
// can be submitted.
TEST_F(TEST_CATEGORY_FIXTURE(graph), empty_graph_default_host_exec) {
  Kokkos::DefaultHostExecutionSpace host_exec{};
  auto graph = Kokkos::Experimental::create_graph(host_exec);
  graph.instantiate();
  graph.submit();
  graph.get_execution_space().fence();
}
// Functor that adds `value`, plus the current contents of `data` at the given
// `indices`, into data(TargetIndex). Used to check graph dependency ordering.
template <typename ViewType, size_t TargetIndex, size_t NumIndices = 0>
struct FetchValuesAndContribute {
  static_assert(std::is_same_v<typename ViewType::value_type,
                               typename ViewType::non_const_value_type>);
  ViewType data;
  typename ViewType::value_type value;
  // Indices read from `data` before contributing; empty by default.
  Kokkos::Array<size_t, NumIndices> indices{};
  // Node with no predecessor values to read.
  FetchValuesAndContribute(ViewType data_,
                           std::integral_constant<size_t, TargetIndex>,
                           typename ViewType::value_type value_)
      : data(std::move(data_)), value(value_) {}
  // Node that also accumulates data(index) for each given index.
  FetchValuesAndContribute(ViewType data_,
                           Kokkos::Array<size_t, NumIndices> indices_,
                           std::integral_constant<size_t, TargetIndex>,
                           typename ViewType::value_type value_)
      : data(std::move(data_)), value(value_), indices(std::move(indices_)) {}
  template <typename T>
  KOKKOS_FUNCTION void operator()(const T) const {
    for (const auto index : indices) data(TargetIndex) += data(index);
    data(TargetIndex) += value;
  }
};
// Deduction guide mapping a braced index list (array reference) onto the
// Kokkos::Array member, deducing NumIndices from the list length.
template <typename ViewType, size_t TargetIndex, size_t NumIndices>
FetchValuesAndContribute(ViewType, const size_t (&)[NumIndices],
                         std::integral_constant<size_t, TargetIndex>,
                         typename ViewType::non_const_value_type)
    -> FetchValuesAndContribute<ViewType, TargetIndex, NumIndices>;
// Ensure that we can handle the simple diamond use case.
//
// topology stream-based approach graph-based
//
// A A(exec_0) Using the API to add nodes, no
// / \ fence(exec_0) user-facing fence anymore because
// B C B(exec_0) C(exec_1) we'd like to rely on the graph to
// \ / fence(exec_1) enforce dependencies.
// D D(exec_0)
TEST_F(TEST_CATEGORY_FIXTURE(graph), diamond) {
#ifdef KOKKOS_ENABLE_OPENMP  // FIXME_OPENMP partition_space
  if (ex.concurrency() < 4)
    GTEST_SKIP() << "test needs at least 4 OpenMP threads";
#endif
  // Four instances: two for the diamond's branches, one to create the graph,
  // one to submit it (see the topology comment above this test).
  const auto execution_space_instances =
      Kokkos::Experimental::partition_space(ex, 1, 1, 1, 1);
  const auto exec_0 = execution_space_instances.at(0);
  const auto exec_1 = execution_space_instances.at(1);
  const auto exec_2 = execution_space_instances.at(2);
  const auto exec_3 = execution_space_instances.at(3);
  using policy_t = Kokkos::RangePolicy<TEST_EXECSPACE>;
  using view_t = Kokkos::View<int*, TEST_EXECSPACE>;
  using view_h_t = Kokkos::View<int*, Kokkos::HostSpace>;
  view_t data(Kokkos::view_alloc(ex, "diamond - data"), 4);
  constexpr int value_A = 42, value_B = 27, value_C = 13, value_D = 147;
  // integral_constant indices let FetchValuesAndContribute deduce its
  // compile-time target index.
  std::integral_constant<size_t, 0> index_A;
  std::integral_constant<size_t, 1> index_B;
  std::integral_constant<size_t, 2> index_C;
  std::integral_constant<size_t, 3> index_D;
  auto graph = Kokkos::Experimental::create_graph(exec_2, [&](auto root) {
    auto node_A = root.then_parallel_for(
        policy_t(exec_0, 0, 1),
        FetchValuesAndContribute(data, index_A, value_A));
    auto node_B = node_A.then_parallel_for(
        policy_t(exec_0, 0, 1),
        FetchValuesAndContribute(data, {index_A()}, index_B, value_B));
    auto node_C = node_A.then_parallel_for(
        policy_t(exec_1, 0, 1),
        FetchValuesAndContribute(data, {index_A()}, index_C, value_C));
    // D joins both branches; its handle is not needed afterwards.
    auto node_D = Kokkos::Experimental::when_all(node_B, node_C)
                      .then_parallel_for(
                          policy_t(exec_0, 0, 1),
                          FetchValuesAndContribute(data, {index_B(), index_C()},
                                                   index_D, value_D));
  });
  graph.instantiate();
  // TODO Check that kernels are running on the execution space instance of
  // their policy if the defaulted graph implementation is used.
  graph.submit(exec_3);
  view_h_t data_host(
      Kokkos::view_alloc(Kokkos::WithoutInitializing, "diamond - data - host"),
      4);
  Kokkos::deep_copy(exec_3, data_host, data);
  exec_3.fence();
  // Each slot accumulates its own value plus everything on its incoming paths.
  ASSERT_EQ(data_host(index_A()), value_A);
  ASSERT_EQ(data_host(index_B()), value_A + value_B);
  ASSERT_EQ(data_host(index_C()), value_A + value_C);
  ASSERT_EQ(data_host(index_D()), 2 * value_A + value_B + value_C + value_D);
}
// Test a configuration that has more than one end node. Ensure that we wait for
// them all by adding a manual kernel after the graph.
// This test mainly is there to ensure that the defaulted graph implementation
// enforces a semantically consistent control flow.
//
// topology stream-based approach
//
// A B A(exec_0) B(exec_1)
// \ / | fence(exec_1)
// C | C(exec_0)
// / E E(exec_1)
// D D(exec_0)
// fence(exec_1)
// F F(exec_0)
TEST_F(TEST_CATEGORY_FIXTURE(graph), end_of_submit_control_flow) {
#ifdef KOKKOS_ENABLE_OPENMP  // FIXME_OPENMP partition_space
  if (ex.concurrency() < 4)
    GTEST_SKIP() << "insufficient number of supported concurrent threads";
#endif
  const auto execution_space_instances =
      Kokkos::Experimental::partition_space(ex, 1, 1, 1, 1);
  const auto exec_0 = execution_space_instances.at(0);
  const auto exec_1 = execution_space_instances.at(1);
  const auto exec_2 = execution_space_instances.at(2);
  const auto exec_3 = execution_space_instances.at(3);
  using policy_t = Kokkos::RangePolicy<TEST_EXECSPACE>;
  using view_t = Kokkos::View<int*, TEST_EXECSPACE>;
  using view_h_t = Kokkos::View<int*, Kokkos::HostSpace>;
  view_t data(Kokkos::view_alloc(ex, "data"), 6);
  constexpr int value_A = 42, value_B = 27, value_C = 13, value_D = 147,
                value_E = 496, value_F = 123;
  std::integral_constant<size_t, 0> index_A;
  std::integral_constant<size_t, 1> index_B;
  std::integral_constant<size_t, 2> index_C;
  std::integral_constant<size_t, 3> index_D;
  std::integral_constant<size_t, 4> index_E;
  std::integral_constant<size_t, 5> index_F;
  // The graph has two end nodes, D and E (see the topology sketch above this
  // test); their handles are intentionally unused.
  auto graph = Kokkos::Experimental::create_graph(exec_2, [&](auto root) {
    auto node_A = root.then_parallel_for(
        policy_t(exec_0, 0, 1),
        FetchValuesAndContribute(data, index_A, value_A));
    auto node_B = root.then_parallel_for(
        policy_t(exec_1, 0, 1),
        FetchValuesAndContribute(data, index_B, value_B));
    auto node_C = Kokkos::Experimental::when_all(node_A, node_B)
                      .then_parallel_for(
                          policy_t(exec_0, 0, 1),
                          FetchValuesAndContribute(data, {index_A(), index_B()},
                                                   index_C, value_C));
    auto node_D = node_C.then_parallel_for(
        policy_t(exec_0, 0, 1),
        FetchValuesAndContribute(data, {index_C()}, index_D, value_D));
    auto node_E = node_B.then_parallel_for(
        policy_t(exec_1, 0, 1),
        FetchValuesAndContribute(data, {index_B()}, index_E, value_E));
  });
  graph.instantiate();
  // TODO Check that kernels are running on the execution space instance of
  // their policy if the defaulted graph implementation is used.
  graph.submit(exec_3);
  // Kernel F runs after the graph on the submitting instance and reads the
  // outputs of both end nodes: submit() must have waited for D and E.
  // clang-format off
  Kokkos::parallel_for(
      policy_t(exec_3, 0, 1),
#if defined(KOKKOS_COMPILER_GNU) && (1010 == KOKKOS_COMPILER_GNU)
      // Workaround CTAD bug, see 7316.
      FetchValuesAndContribute<decltype(data), index_F, 2>(data, {index_D(), index_E()}, index_F, value_F));
#else
      FetchValuesAndContribute(data, {index_D(), index_E()}, index_F, value_F));
#endif
  // clang-format on
  view_h_t data_host(
      Kokkos::view_alloc(Kokkos::WithoutInitializing, "data - host"), 6);
  Kokkos::deep_copy(exec_3, data_host, data);
  exec_3.fence();
  ASSERT_EQ(data_host(index_A()), value_A);
  ASSERT_EQ(data_host(index_B()), value_B);
  ASSERT_EQ(data_host(index_C()), value_A + value_B + value_C);
  ASSERT_EQ(data_host(index_D()), value_A + value_B + value_C + value_D);
  ASSERT_EQ(data_host(index_E()), value_B + value_E);
  ASSERT_EQ(data_host(index_F()),
            value_A + 2 * value_B + value_C + value_D + value_E + value_F);
}
} // end namespace Test