1757 lines
60 KiB
C++
1757 lines
60 KiB
C++
//@HEADER
|
|
// ************************************************************************
|
|
//
|
|
// Kokkos v. 4.0
|
|
// Copyright (2022) National Technology & Engineering
|
|
// Solutions of Sandia, LLC (NTESS).
|
|
//
|
|
// Under the terms of Contract DE-NA0003525 with NTESS,
|
|
// the U.S. Government retains certain rights in this software.
|
|
//
|
|
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://kokkos.org/LICENSE for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//@HEADER
|
|
|
|
#include <cstdio>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
|
|
#include <Kokkos_Core.hpp>
|
|
|
|
namespace Test {
|
|
|
|
namespace {
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct TestTeamPolicy {
|
|
using team_member =
|
|
typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
|
|
using view_type = Kokkos::View<int **, ExecSpace>;
|
|
|
|
view_type m_flags;
|
|
|
|
TestTeamPolicy(const size_t league_size)
|
|
: m_flags(Kokkos::view_alloc(Kokkos::WithoutInitializing, "flags"),
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace>(
|
|
1, std::is_same<ExecSpace,
|
|
Kokkos::Experimental::OpenMPTarget>::value
|
|
? 32
|
|
: 1)
|
|
.team_size_max(*this, Kokkos::ParallelReduceTag()),
|
|
#else
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace>(1, 1).team_size_max(
|
|
*this, Kokkos::ParallelReduceTag()),
|
|
#endif
|
|
league_size) {
|
|
}
|
|
|
|
struct VerifyInitTag {};
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const team_member &member) const {
|
|
const int tid =
|
|
member.team_rank() + member.team_size() * member.league_rank();
|
|
|
|
m_flags(member.team_rank(), member.league_rank()) = tid;
|
|
static_assert(
|
|
(std::is_same<typename team_member::execution_space, ExecSpace>::value),
|
|
"TeamMember::execution_space is not the same as "
|
|
"TeamPolicy<>::execution_space");
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const VerifyInitTag &, const team_member &member) const {
|
|
const int tid =
|
|
member.team_rank() + member.team_size() * member.league_rank();
|
|
|
|
if (tid != m_flags(member.team_rank(), member.league_rank())) {
|
|
Kokkos::printf("TestTeamPolicy member(%d,%d) error %d != %d\n",
|
|
member.league_rank(), member.team_rank(), tid,
|
|
m_flags(member.team_rank(), member.league_rank()));
|
|
}
|
|
}
|
|
|
|
// Included for test_small_league_size.
|
|
TestTeamPolicy() : m_flags() {}
|
|
|
|
// Included for test_small_league_size.
|
|
struct NoOpTag {};
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const NoOpTag &, const team_member & /*member*/) const {}
|
|
|
|
static void test_small_league_size() {
|
|
int bs = 8; // batch size (number of elements per batch)
|
|
int ns = 16; // total number of "problems" to process
|
|
|
|
// Calculate total scratch memory space size.
|
|
const int level = 0;
|
|
int mem_size = 960;
|
|
const int num_teams = ns / bs;
|
|
Kokkos::TeamPolicy<ExecSpace, NoOpTag> policy(num_teams, Kokkos::AUTO());
|
|
|
|
Kokkos::parallel_for(
|
|
policy.set_scratch_size(level, Kokkos::PerTeam(mem_size),
|
|
Kokkos::PerThread(0)),
|
|
TestTeamPolicy());
|
|
}
|
|
|
|
static void test_constructors() {
|
|
constexpr const int smallest_work = 1;
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
|
|
smallest_work,
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? 32
|
|
: smallest_work,
|
|
smallest_work);
|
|
#else
|
|
Kokkos::TeamPolicy<ExecSpace, NoOpTag> none_auto(
|
|
smallest_work, smallest_work, smallest_work);
|
|
#endif
|
|
(void)none_auto;
|
|
Kokkos::TeamPolicy<ExecSpace, NoOpTag> both_auto(
|
|
smallest_work, Kokkos::AUTO(), Kokkos::AUTO());
|
|
(void)both_auto;
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
|
|
smallest_work,
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? 32
|
|
: smallest_work,
|
|
Kokkos::AUTO());
|
|
#else
|
|
Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_vector(
|
|
smallest_work, smallest_work, Kokkos::AUTO());
|
|
#endif
|
|
(void)auto_vector;
|
|
Kokkos::TeamPolicy<ExecSpace, NoOpTag> auto_team(
|
|
smallest_work, Kokkos::AUTO(), smallest_work);
|
|
(void)auto_team;
|
|
}
|
|
|
|
static void test_for(const size_t league_size) {
|
|
{
|
|
TestTeamPolicy functor(league_size);
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
|
|
using policy_type_init =
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace, VerifyInitTag>;
|
|
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least
|
|
// 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
const int team_size =
|
|
policy_type(
|
|
league_size,
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? 32
|
|
: 1)
|
|
.team_size_max(functor, Kokkos::ParallelForTag());
|
|
const int team_size_init =
|
|
policy_type_init(
|
|
league_size,
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? 32
|
|
: 1)
|
|
.team_size_max(functor, Kokkos::ParallelForTag());
|
|
#else
|
|
const int team_size =
|
|
policy_type(league_size, 1)
|
|
.team_size_max(functor, Kokkos::ParallelForTag());
|
|
const int team_size_init =
|
|
policy_type_init(league_size, 1)
|
|
.team_size_max(functor, Kokkos::ParallelForTag());
|
|
#endif
|
|
|
|
Kokkos::parallel_for(policy_type(league_size, team_size), functor);
|
|
Kokkos::parallel_for(policy_type_init(league_size, team_size_init),
|
|
functor);
|
|
}
|
|
|
|
test_small_league_size();
|
|
test_constructors();
|
|
}
|
|
|
|
struct ReduceTag {};
|
|
|
|
using value_type = int64_t;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const team_member &member, value_type &update) const {
|
|
update += member.team_rank() + member.team_size() * member.league_rank();
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const ReduceTag &, const team_member &member,
|
|
value_type &update) const {
|
|
update +=
|
|
1 + member.team_rank() + member.team_size() * member.league_rank();
|
|
}
|
|
|
|
static void test_reduce(const size_t league_size) {
|
|
TestTeamPolicy functor(league_size);
|
|
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
|
|
using policy_type_reduce =
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace, ReduceTag>;
|
|
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
const int team_size =
|
|
policy_type_reduce(
|
|
league_size,
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? 32
|
|
: 1)
|
|
.team_size_max(functor, Kokkos::ParallelReduceTag());
|
|
#else
|
|
const int team_size =
|
|
policy_type_reduce(league_size, 1)
|
|
.team_size_max(functor, Kokkos::ParallelReduceTag());
|
|
#endif
|
|
|
|
const int64_t N = team_size * league_size;
|
|
|
|
int64_t total = 0;
|
|
|
|
Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
|
|
total);
|
|
ASSERT_EQ(size_t((N - 1) * (N)) / 2, size_t(total));
|
|
|
|
Kokkos::parallel_reduce(policy_type_reduce(league_size, team_size), functor,
|
|
total);
|
|
ASSERT_EQ((size_t(N) * size_t(N + 1)) / 2, size_t(total));
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
} // namespace Test
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
|
|
template <typename ScalarType, class DeviceType, class ScheduleType>
|
|
class ReduceTeamFunctor {
|
|
public:
|
|
using execution_space = DeviceType;
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
|
|
using size_type = typename execution_space::size_type;
|
|
|
|
struct value_type {
|
|
ScalarType value[3];
|
|
};
|
|
|
|
const size_type nwork;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
ReduceTeamFunctor(const size_type &arg_nwork) : nwork(arg_nwork) {}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
ReduceTeamFunctor(const ReduceTeamFunctor &rhs) : nwork(rhs.nwork) {}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void init(value_type &dst) const {
|
|
dst.value[0] = 0;
|
|
dst.value[1] = 0;
|
|
dst.value[2] = 0;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void join(value_type &dst, const value_type &src) const {
|
|
dst.value[0] += src.value[0];
|
|
dst.value[1] += src.value[1];
|
|
dst.value[2] += src.value[2];
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const typename policy_type::member_type ind,
|
|
value_type &dst) const {
|
|
const int thread_rank =
|
|
ind.team_rank() + ind.team_size() * ind.league_rank();
|
|
const int thread_size = ind.team_size() * ind.league_size();
|
|
const int chunk = (nwork + thread_size - 1) / thread_size;
|
|
|
|
size_type iwork = chunk * thread_rank;
|
|
const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork;
|
|
|
|
for (; iwork < iwork_end; ++iwork) {
|
|
dst.value[0] += 1;
|
|
dst.value[1] += iwork + 1;
|
|
dst.value[2] += nwork - iwork;
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace Test
|
|
|
|
namespace {
|
|
|
|
template <typename ScalarType, class DeviceType, class ScheduleType>
|
|
class TestReduceTeam {
|
|
public:
|
|
using execution_space = DeviceType;
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
|
|
using size_type = typename execution_space::size_type;
|
|
|
|
TestReduceTeam(const size_type &nwork) { run_test(nwork); }
|
|
|
|
void run_test(const size_type &nwork) {
|
|
using functor_type =
|
|
Test::ReduceTeamFunctor<ScalarType, execution_space, ScheduleType>;
|
|
using value_type = typename functor_type::value_type;
|
|
using result_type =
|
|
Kokkos::View<value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
|
|
|
|
enum { Count = 3 };
|
|
enum { Repeat = 100 };
|
|
|
|
value_type result[Repeat];
|
|
|
|
const uint64_t nw = nwork;
|
|
const uint64_t nsum = nw % 2 ? nw * ((nw + 1) / 2) : (nw / 2) * (nw + 1);
|
|
|
|
policy_type team_exec(nw, 1);
|
|
|
|
const unsigned team_size = team_exec.team_size_recommended(
|
|
functor_type(nwork), Kokkos::ParallelReduceTag());
|
|
const unsigned league_size = (nwork + team_size - 1) / team_size;
|
|
|
|
team_exec = policy_type(league_size, team_size);
|
|
|
|
for (unsigned i = 0; i < Repeat; ++i) {
|
|
result_type tmp(&result[i]);
|
|
Kokkos::parallel_reduce(team_exec, functor_type(nwork), tmp);
|
|
}
|
|
|
|
execution_space().fence();
|
|
|
|
for (unsigned i = 0; i < Repeat; ++i) {
|
|
for (unsigned j = 0; j < Count; ++j) {
|
|
const uint64_t correct = 0 == j % 3 ? nw : nsum;
|
|
ASSERT_EQ((ScalarType)correct, result[i].value[j]);
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
|
|
template <class DeviceType, class ScheduleType>
|
|
class ScanTeamFunctor {
|
|
public:
|
|
using execution_space = DeviceType;
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
|
|
using value_type = int64_t;
|
|
|
|
Kokkos::View<value_type, execution_space> accum;
|
|
Kokkos::View<value_type, execution_space> total;
|
|
|
|
ScanTeamFunctor() : accum("accum"), total("total") {}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void init(value_type &error) const { error = 0; }
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void join(value_type &error, value_type const &input) const {
|
|
if (input) error = 1;
|
|
}
|
|
|
|
struct JoinMax {
|
|
using value_type = int64_t;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void join(value_type &dst, value_type const &input) const {
|
|
if (dst < input) dst = input;
|
|
}
|
|
};
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const typename policy_type::member_type ind,
|
|
value_type &error) const {
|
|
if (0 == ind.league_rank() && 0 == ind.team_rank()) {
|
|
const int64_t thread_count = ind.league_size() * ind.team_size();
|
|
total() = (thread_count * (thread_count + 1)) / 2;
|
|
}
|
|
|
|
// Team max:
|
|
int64_t m = (int64_t)(ind.league_rank() + ind.team_rank());
|
|
ind.team_reduce(Kokkos::Max<int64_t>(m));
|
|
|
|
if (m != ind.league_rank() + (ind.team_size() - 1)) {
|
|
Kokkos::printf(
|
|
"ScanTeamFunctor[%i.%i of %i.%i] reduce_max_answer(%li) != "
|
|
"reduce_max(%li)\n",
|
|
static_cast<int>(ind.league_rank()),
|
|
static_cast<int>(ind.team_rank()),
|
|
static_cast<int>(ind.league_size()),
|
|
static_cast<int>(ind.team_size()),
|
|
static_cast<long>(ind.league_rank() + (ind.team_size() - 1)),
|
|
static_cast<long>(m));
|
|
}
|
|
|
|
// Scan:
|
|
const int64_t answer = (ind.league_rank() + 1) * ind.team_rank() +
|
|
(ind.team_rank() * (ind.team_rank() + 1)) / 2;
|
|
|
|
const int64_t result =
|
|
ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
|
|
|
|
const int64_t result2 =
|
|
ind.team_scan(ind.league_rank() + 1 + ind.team_rank() + 1);
|
|
|
|
if (answer != result || answer != result2) {
|
|
Kokkos::printf(
|
|
"ScanTeamFunctor[%i.%i of %i.%i] answer(%li) != scan_first(%li) or "
|
|
"scan_second(%li)\n",
|
|
static_cast<int>(ind.league_rank()),
|
|
static_cast<int>(ind.team_rank()),
|
|
static_cast<int>(ind.league_size()),
|
|
static_cast<int>(ind.team_size()), static_cast<long>(answer),
|
|
static_cast<long>(result), static_cast<long>(result2));
|
|
|
|
error = 1;
|
|
}
|
|
|
|
const int64_t thread_rank =
|
|
ind.team_rank() + ind.team_size() * ind.league_rank();
|
|
ind.team_scan(1 + thread_rank, accum.data());
|
|
}
|
|
};
|
|
|
|
template <class DeviceType, class ScheduleType>
|
|
class TestScanTeam {
|
|
public:
|
|
using execution_space = DeviceType;
|
|
using value_type = int64_t;
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
|
|
using functor_type = Test::ScanTeamFunctor<DeviceType, ScheduleType>;
|
|
|
|
TestScanTeam(const size_t nteam) { run_test(nteam); }
|
|
|
|
void run_test(const size_t nteam) {
|
|
using result_type =
|
|
Kokkos::View<int64_t, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>;
|
|
|
|
const unsigned REPEAT = 100000;
|
|
unsigned Repeat;
|
|
|
|
if (nteam == 0) {
|
|
Repeat = 1;
|
|
} else {
|
|
Repeat = (REPEAT + nteam - 1) / nteam; // Error here.
|
|
}
|
|
|
|
functor_type functor;
|
|
|
|
policy_type team_exec(nteam, 1);
|
|
const auto team_size =
|
|
team_exec.team_size_max(functor, Kokkos::ParallelReduceTag());
|
|
team_exec = policy_type(nteam, team_size);
|
|
|
|
for (unsigned i = 0; i < Repeat; ++i) {
|
|
int64_t accum = 0;
|
|
int64_t total = 0;
|
|
int64_t error = 0;
|
|
Kokkos::deep_copy(functor.accum, total);
|
|
|
|
Kokkos::parallel_reduce(team_exec, functor, result_type(&error));
|
|
DeviceType().fence();
|
|
|
|
Kokkos::deep_copy(accum, functor.accum);
|
|
Kokkos::deep_copy(total, functor.total);
|
|
|
|
ASSERT_EQ(error, 0);
|
|
ASSERT_EQ(total, accum);
|
|
}
|
|
|
|
execution_space().fence();
|
|
}
|
|
};
|
|
|
|
} // namespace Test
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct SharedTeamFunctor {
|
|
using execution_space = ExecSpace;
|
|
using value_type = int;
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
|
|
|
|
enum { SHARED_COUNT = 1000 };
|
|
|
|
using shmem_space = typename ExecSpace::scratch_memory_space;
|
|
|
|
// TBD: MemoryUnmanaged should be the default for shared memory space.
|
|
using shared_int_array_type =
|
|
Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;
|
|
|
|
// Tell how much shared memory will be required by this functor.
|
|
inline unsigned team_shmem_size(int /*team_size*/) const {
|
|
return shared_int_array_type::shmem_size(SHARED_COUNT) +
|
|
shared_int_array_type::shmem_size(SHARED_COUNT);
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const typename policy_type::member_type &ind,
|
|
value_type &update) const {
|
|
const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
|
|
const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);
|
|
|
|
if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
|
|
(shared_B.data() == nullptr && SHARED_COUNT > 0)) {
|
|
Kokkos::printf(
|
|
"member( %i/%i , %i/%i ) Failed to allocate shared memory of size "
|
|
"%lu\n",
|
|
static_cast<int>(ind.league_rank()),
|
|
static_cast<int>(ind.league_size()),
|
|
static_cast<int>(ind.team_rank()), static_cast<int>(ind.team_size()),
|
|
static_cast<unsigned long>(SHARED_COUNT));
|
|
|
|
++update; // Failure to allocate is an error.
|
|
} else {
|
|
for (int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size()) {
|
|
shared_A[i] = i + ind.league_rank();
|
|
shared_B[i] = 2 * i + ind.league_rank();
|
|
}
|
|
|
|
ind.team_barrier();
|
|
|
|
if (ind.team_rank() + 1 == ind.team_size()) {
|
|
for (int i = 0; i < SHARED_COUNT; ++i) {
|
|
if (shared_A[i] != i + ind.league_rank()) {
|
|
++update;
|
|
}
|
|
|
|
if (shared_B[i] != 2 * i + ind.league_rank()) {
|
|
++update;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace Test
|
|
|
|
namespace {
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct TestSharedTeam {
|
|
TestSharedTeam() { run(); }
|
|
|
|
void run() {
|
|
using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
|
|
using result_type =
|
|
Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
|
|
Kokkos::MemoryUnmanaged>;
|
|
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
const size_t team_size =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? Kokkos::TeamPolicy<ScheduleType, ExecSpace>(64, 32).team_size_max(
|
|
Functor(), Kokkos::ParallelReduceTag())
|
|
: Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1)
|
|
.team_size_max(Functor(), Kokkos::ParallelReduceTag());
|
|
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? 32 / team_size
|
|
: 8192 / team_size,
|
|
team_size);
|
|
#else
|
|
const size_t team_size =
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace>(8192, 1).team_size_max(
|
|
Functor(), Kokkos::ParallelReduceTag());
|
|
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
|
|
team_size);
|
|
#endif
|
|
|
|
typename Functor::value_type error_count = 0;
|
|
|
|
Kokkos::parallel_reduce(team_exec, Functor(), result_type(&error_count));
|
|
Kokkos::fence();
|
|
|
|
ASSERT_EQ(error_count, 0);
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
namespace Test {
|
|
|
|
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
|
|
template <class MemorySpace, class ExecSpace, class ScheduleType>
|
|
struct TestLambdaSharedTeam {
|
|
TestLambdaSharedTeam() { run(); }
|
|
|
|
void run() {
|
|
using Functor = Test::SharedTeamFunctor<ExecSpace, ScheduleType>;
|
|
using result_type = Kokkos::View<typename Functor::value_type, MemorySpace,
|
|
Kokkos::MemoryUnmanaged>;
|
|
|
|
using shmem_space = typename ExecSpace::scratch_memory_space;
|
|
|
|
// TBD: MemoryUnmanaged should be the default for shared memory space.
|
|
using shared_int_array_type =
|
|
Kokkos::View<int *, shmem_space, Kokkos::MemoryUnmanaged>;
|
|
|
|
const int SHARED_COUNT = 1000;
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
int team_size =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
|
|
: 1;
|
|
#else
|
|
int team_size = 1;
|
|
#endif
|
|
|
|
#ifdef KOKKOS_ENABLE_CUDA
|
|
if (std::is_same<ExecSpace, Kokkos::Cuda>::value) team_size = 128;
|
|
#endif
|
|
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace> team_exec(8192 / team_size,
|
|
team_size);
|
|
|
|
int scratch_size = shared_int_array_type::shmem_size(SHARED_COUNT) * 2;
|
|
team_exec = team_exec.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
|
|
|
|
typename Functor::value_type error_count = 0;
|
|
|
|
Kokkos::parallel_reduce(
|
|
team_exec,
|
|
KOKKOS_LAMBDA(
|
|
const typename Kokkos::TeamPolicy<ScheduleType,
|
|
ExecSpace>::member_type &ind,
|
|
int &update) {
|
|
const shared_int_array_type shared_A(ind.team_shmem(), SHARED_COUNT);
|
|
const shared_int_array_type shared_B(ind.team_shmem(), SHARED_COUNT);
|
|
|
|
if ((shared_A.data() == nullptr && SHARED_COUNT > 0) ||
|
|
(shared_B.data() == nullptr && SHARED_COUNT > 0)) {
|
|
Kokkos::printf("Failed to allocate shared memory of size %lu\n",
|
|
static_cast<unsigned long>(SHARED_COUNT));
|
|
|
|
++update; // Failure to allocate is an error.
|
|
} else {
|
|
for (int i = ind.team_rank(); i < SHARED_COUNT;
|
|
i += ind.team_size()) {
|
|
shared_A[i] = i + ind.league_rank();
|
|
shared_B[i] = 2 * i + ind.league_rank();
|
|
}
|
|
|
|
ind.team_barrier();
|
|
|
|
if (ind.team_rank() + 1 == ind.team_size()) {
|
|
for (int i = 0; i < SHARED_COUNT; ++i) {
|
|
if (shared_A[i] != i + ind.league_rank()) {
|
|
++update;
|
|
}
|
|
|
|
if (shared_B[i] != 2 * i + ind.league_rank()) {
|
|
++update;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
result_type(&error_count));
|
|
|
|
Kokkos::fence();
|
|
|
|
ASSERT_EQ(error_count, 0);
|
|
}
|
|
};
|
|
#endif
|
|
|
|
} // namespace Test
|
|
|
|
namespace Test {
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct ScratchTeamFunctor {
|
|
using execution_space = ExecSpace;
|
|
using value_type = int;
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, execution_space>;
|
|
|
|
enum { SHARED_TEAM_COUNT = 100 };
|
|
enum { SHARED_THREAD_COUNT = 10 };
|
|
|
|
using shmem_space = typename ExecSpace::scratch_memory_space;
|
|
|
|
// TBD: MemoryUnmanaged should be the default for shared memory space.
|
|
using shared_int_array_type =
|
|
Kokkos::View<size_t *, shmem_space, Kokkos::MemoryUnmanaged>;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const typename policy_type::member_type &ind,
|
|
value_type &update) const {
|
|
const shared_int_array_type scratch_ptr(ind.team_scratch(1),
|
|
3 * ind.team_size());
|
|
const shared_int_array_type scratch_A(ind.team_scratch(1),
|
|
SHARED_TEAM_COUNT);
|
|
const shared_int_array_type scratch_B(ind.thread_scratch(1),
|
|
SHARED_THREAD_COUNT);
|
|
|
|
if ((scratch_ptr.data() == nullptr) ||
|
|
(scratch_A.data() == nullptr && SHARED_TEAM_COUNT > 0) ||
|
|
(scratch_B.data() == nullptr && SHARED_THREAD_COUNT > 0)) {
|
|
Kokkos::printf("Failed to allocate shared memory of size %lu\n",
|
|
static_cast<unsigned long>(SHARED_TEAM_COUNT));
|
|
|
|
++update; // Failure to allocate is an error.
|
|
} else {
|
|
Kokkos::parallel_for(
|
|
Kokkos::TeamThreadRange(ind, 0, (int)SHARED_TEAM_COUNT),
|
|
[&](const int &i) { scratch_A[i] = i + ind.league_rank(); });
|
|
|
|
for (int i = 0; i < SHARED_THREAD_COUNT; i++) {
|
|
scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
|
|
}
|
|
|
|
scratch_ptr[ind.team_rank()] = (size_t)scratch_A.data();
|
|
scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t)scratch_B.data();
|
|
|
|
ind.team_barrier();
|
|
|
|
for (int i = 0; i < SHARED_TEAM_COUNT; i++) {
|
|
if (scratch_A[i] != size_t(i + ind.league_rank())) ++update;
|
|
}
|
|
|
|
for (int i = 0; i < ind.team_size(); i++) {
|
|
if (scratch_ptr[0] != scratch_ptr[i]) ++update;
|
|
}
|
|
|
|
if (scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] <
|
|
SHARED_THREAD_COUNT * sizeof(size_t)) {
|
|
++update;
|
|
}
|
|
|
|
for (int i = 1; i < ind.team_size(); i++) {
|
|
if ((scratch_ptr[i + ind.team_size()] -
|
|
scratch_ptr[i - 1 + ind.team_size()]) !=
|
|
(scratch_ptr[1 + ind.team_size()] -
|
|
scratch_ptr[0 + ind.team_size()])) {
|
|
++update;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace Test
|
|
|
|
namespace {
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct TestScratchTeam {
|
|
TestScratchTeam() { run(); }
|
|
|
|
void run() {
|
|
using Functor = Test::ScratchTeamFunctor<ExecSpace, ScheduleType>;
|
|
using result_type =
|
|
Kokkos::View<typename Functor::value_type, Kokkos::HostSpace,
|
|
Kokkos::MemoryUnmanaged>;
|
|
using p_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
|
|
|
|
typename Functor::value_type error_count = 0;
|
|
|
|
int thread_scratch_size = Functor::shared_int_array_type::shmem_size(
|
|
Functor::SHARED_THREAD_COUNT);
|
|
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
p_type team_exec =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? p_type(64, 32).set_scratch_size(
|
|
1,
|
|
Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
|
|
Functor::SHARED_TEAM_COUNT)),
|
|
Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)))
|
|
: p_type(8192, 1).set_scratch_size(
|
|
1,
|
|
Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
|
|
Functor::SHARED_TEAM_COUNT)),
|
|
Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
|
|
#else
|
|
p_type team_exec = p_type(8192, 1).set_scratch_size(
|
|
1,
|
|
Kokkos::PerTeam(Functor::shared_int_array_type::shmem_size(
|
|
Functor::SHARED_TEAM_COUNT)),
|
|
Kokkos::PerThread(thread_scratch_size + 3 * sizeof(int)));
|
|
#endif
|
|
|
|
const size_t team_size =
|
|
team_exec.team_size_max(Functor(), Kokkos::ParallelReduceTag());
|
|
|
|
int team_scratch_size =
|
|
Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
|
|
Functor::shared_int_array_type::shmem_size(3 * team_size);
|
|
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
team_exec =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value
|
|
? p_type(64 / team_size, team_size)
|
|
: p_type(8192 / team_size, team_size);
|
|
#else
|
|
team_exec = p_type(8192 / team_size, team_size);
|
|
#endif
|
|
|
|
Kokkos::parallel_reduce(
|
|
team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size),
|
|
Kokkos::PerThread(thread_scratch_size)),
|
|
Functor(), result_type(&error_count));
|
|
Kokkos::fence();
|
|
ASSERT_EQ(error_count, 0);
|
|
|
|
Kokkos::parallel_reduce(
|
|
team_exec.set_scratch_size(1, Kokkos::PerTeam(team_scratch_size),
|
|
Kokkos::PerThread(thread_scratch_size)),
|
|
Functor(), Kokkos::Sum<typename Functor::value_type>(error_count));
|
|
Kokkos::fence();
|
|
ASSERT_EQ(error_count, 0);
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
namespace Test {
|
|
|
|
template <class ExecSpace>
|
|
KOKKOS_INLINE_FUNCTION int test_team_mulit_level_scratch_loop_body(
|
|
const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
a_team1(team.team_scratch(0), 128);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
a_thread1(team.thread_scratch(0), 16);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
a_team2(team.team_scratch(0), 128);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
a_thread2(team.thread_scratch(0), 16);
|
|
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
b_team1(team.team_scratch(1), 12800);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
b_thread1(team.thread_scratch(1), 1600);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
b_team2(team.team_scratch(1), 12800);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
b_thread2(team.thread_scratch(1), 1600);
|
|
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
a_team3(team.team_scratch(0), 128);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
a_thread3(team.thread_scratch(0), 16);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
b_team3(team.team_scratch(1), 12800);
|
|
Kokkos::View<double *, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>>
|
|
b_thread3(team.thread_scratch(1), 1600);
|
|
|
|
// The explicit types for 0 and 128 are here to test TeamThreadRange accepting
|
|
// different types for begin and end.
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(128)),
|
|
[&](const int &i) {
|
|
a_team1(i) = 1000000 + i + team.league_rank() * 100000;
|
|
a_team2(i) = 2000000 + i + team.league_rank() * 100000;
|
|
a_team3(i) = 3000000 + i + team.league_rank() * 100000;
|
|
});
|
|
team.team_barrier();
|
|
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, int(0), unsigned(16)),
|
|
[&](const int &i) {
|
|
a_thread1(i) = 1000000 + 100000 * team.team_rank() +
|
|
16 - i + team.league_rank() * 100000;
|
|
a_thread2(i) = 2000000 + 100000 * team.team_rank() +
|
|
16 - i + team.league_rank() * 100000;
|
|
a_thread3(i) = 3000000 + 100000 * team.team_rank() +
|
|
16 - i + team.league_rank() * 100000;
|
|
});
|
|
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, int(0), unsigned(12800)),
|
|
[&](const int &i) {
|
|
b_team1(i) = 1000000 + i + team.league_rank() * 100000;
|
|
b_team2(i) = 2000000 + i + team.league_rank() * 100000;
|
|
b_team3(i) = 3000000 + i + team.league_rank() * 100000;
|
|
});
|
|
team.team_barrier();
|
|
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 1600),
|
|
[&](const int &i) {
|
|
b_thread1(i) = 1000000 + 100000 * team.team_rank() +
|
|
16 - i + team.league_rank() * 100000;
|
|
b_thread2(i) = 2000000 + 100000 * team.team_rank() +
|
|
16 - i + team.league_rank() * 100000;
|
|
b_thread3(i) = 3000000 + 100000 * team.team_rank() +
|
|
16 - i + team.league_rank() * 100000;
|
|
});
|
|
|
|
team.team_barrier();
|
|
|
|
int error = 0;
|
|
Kokkos::parallel_for(
|
|
Kokkos::TeamThreadRange(team, 0, 128), [&](const int &i) {
|
|
if (a_team1(i) != 1000000 + i + team.league_rank() * 100000) error++;
|
|
if (a_team2(i) != 2000000 + i + team.league_rank() * 100000) error++;
|
|
if (a_team3(i) != 3000000 + i + team.league_rank() * 100000) error++;
|
|
});
|
|
team.team_barrier();
|
|
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, 16), [&](const int &i) {
|
|
if (a_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i +
|
|
team.league_rank() * 100000)
|
|
error++;
|
|
if (a_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i +
|
|
team.league_rank() * 100000)
|
|
error++;
|
|
if (a_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i +
|
|
team.league_rank() * 100000)
|
|
error++;
|
|
});
|
|
|
|
Kokkos::parallel_for(
|
|
Kokkos::TeamThreadRange(team, 0, 12800), [&](const int &i) {
|
|
if (b_team1(i) != 1000000 + i + team.league_rank() * 100000) error++;
|
|
if (b_team2(i) != 2000000 + i + team.league_rank() * 100000) error++;
|
|
if (b_team3(i) != 3000000 + i + team.league_rank() * 100000) error++;
|
|
});
|
|
team.team_barrier();
|
|
|
|
Kokkos::parallel_for(
|
|
Kokkos::ThreadVectorRange(team, 1600), [&](const int &i) {
|
|
if (b_thread1(i) != 1000000 + 100000 * team.team_rank() + 16 - i +
|
|
team.league_rank() * 100000)
|
|
error++;
|
|
if (b_thread2(i) != 2000000 + 100000 * team.team_rank() + 16 - i +
|
|
team.league_rank() * 100000)
|
|
error++;
|
|
if (b_thread3(i) != 3000000 + 100000 * team.team_rank() + 16 - i +
|
|
team.league_rank() * 100000)
|
|
error++;
|
|
});
|
|
|
|
return error;
|
|
}
|
|
|
|
struct TagReduce {};
|
|
struct TagFor {};
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct ClassNoShmemSizeFunction {
|
|
using member_type =
|
|
typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
|
|
|
|
Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const TagFor &, const member_type &team) const {
|
|
int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
errors() += error;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const TagReduce &, const member_type &team,
|
|
int &error) const {
|
|
error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
}
|
|
|
|
void run() {
|
|
Kokkos::View<int, ExecSpace> d_errors =
|
|
Kokkos::View<int, ExecSpace>("Errors");
|
|
errors = d_errors;
|
|
|
|
const int per_team0 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
|
|
const int per_thread0 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
|
|
|
|
const int per_team1 =
|
|
3 * Kokkos::View<
|
|
double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
|
|
const int per_thread1 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);
|
|
|
|
#ifdef KOKKOS_ENABLE_SYCL
|
|
int team_size = 4;
|
|
#else
|
|
int team_size = 8;
|
|
#endif
|
|
int const concurrency = ExecSpace().concurrency();
|
|
if (team_size > concurrency) team_size = concurrency;
|
|
{
|
|
Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
|
|
16);
|
|
|
|
Kokkos::parallel_for(
|
|
policy
|
|
.set_scratch_size(0, Kokkos::PerTeam(per_team0),
|
|
Kokkos::PerThread(per_thread0))
|
|
.set_scratch_size(1, Kokkos::PerTeam(per_team1),
|
|
Kokkos::PerThread(per_thread1)),
|
|
*this);
|
|
Kokkos::fence();
|
|
|
|
typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
|
|
Kokkos::create_mirror_view(d_errors);
|
|
Kokkos::deep_copy(h_errors, d_errors);
|
|
ASSERT_EQ(h_errors(), 0);
|
|
}
|
|
|
|
{
|
|
int error = 0;
|
|
Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
|
|
10, team_size, 16);
|
|
|
|
Kokkos::parallel_reduce(
|
|
policy
|
|
.set_scratch_size(0, Kokkos::PerTeam(per_team0),
|
|
Kokkos::PerThread(per_thread0))
|
|
.set_scratch_size(1, Kokkos::PerTeam(per_team1),
|
|
Kokkos::PerThread(per_thread1)),
|
|
*this, error);
|
|
|
|
ASSERT_EQ(error, 0);
|
|
}
|
|
};
|
|
};
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct ClassWithShmemSizeFunction {
|
|
using member_type =
|
|
typename Kokkos::TeamPolicy<ExecSpace, ScheduleType>::member_type;
|
|
|
|
Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const TagFor &, const member_type &team) const {
|
|
int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
errors() += error;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const TagReduce &, const member_type &team,
|
|
int &error) const {
|
|
error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
}
|
|
|
|
void run() {
|
|
Kokkos::View<int, ExecSpace> d_errors =
|
|
Kokkos::View<int, ExecSpace>("Errors");
|
|
errors = d_errors;
|
|
|
|
const int per_team1 =
|
|
3 * Kokkos::View<
|
|
double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
|
|
const int per_thread1 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);
|
|
|
|
int team_size = 8;
|
|
|
|
int const concurrency = ExecSpace().concurrency();
|
|
if (team_size > concurrency) team_size = concurrency;
|
|
|
|
{
|
|
Kokkos::TeamPolicy<TagFor, ExecSpace, ScheduleType> policy(10, team_size,
|
|
16);
|
|
|
|
Kokkos::parallel_for(
|
|
policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
|
|
Kokkos::PerThread(per_thread1)),
|
|
*this);
|
|
Kokkos::fence();
|
|
|
|
typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
|
|
Kokkos::create_mirror_view(d_errors);
|
|
Kokkos::deep_copy(h_errors, d_errors);
|
|
ASSERT_EQ(h_errors(), 0);
|
|
}
|
|
|
|
{
|
|
int error = 0;
|
|
Kokkos::TeamPolicy<TagReduce, ExecSpace, ScheduleType> policy(
|
|
10, team_size, 16);
|
|
|
|
Kokkos::parallel_reduce(
|
|
policy.set_scratch_size(1, Kokkos::PerTeam(per_team1),
|
|
Kokkos::PerThread(per_thread1)),
|
|
*this, error);
|
|
|
|
ASSERT_EQ(error, 0);
|
|
}
|
|
};
|
|
|
|
unsigned team_shmem_size(int team_size) const {
|
|
const int per_team0 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
|
|
const int per_thread0 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
|
|
return per_team0 + team_size * per_thread0;
|
|
}
|
|
};
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
void test_team_mulit_level_scratch_test_lambda() {
|
|
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
|
Kokkos::View<int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic>> errors;
|
|
Kokkos::View<int, ExecSpace> d_errors("Errors");
|
|
errors = d_errors;
|
|
|
|
const int per_team0 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
|
|
const int per_thread0 =
|
|
3 * Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
|
|
|
|
const int per_team1 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(12800);
|
|
const int per_thread1 =
|
|
3 *
|
|
Kokkos::View<double *, ExecSpace,
|
|
Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(1600);
|
|
|
|
#ifdef KOKKOS_ENABLE_SYCL
|
|
int team_size = 4;
|
|
#else
|
|
int team_size = 8;
|
|
#endif
|
|
int const concurrency = ExecSpace().concurrency();
|
|
if (team_size > concurrency) team_size = concurrency;
|
|
|
|
Kokkos::TeamPolicy<ExecSpace, ScheduleType> policy(10, team_size, 16);
|
|
|
|
Kokkos::parallel_for(
|
|
policy
|
|
.set_scratch_size(0, Kokkos::PerTeam(per_team0),
|
|
Kokkos::PerThread(per_thread0))
|
|
.set_scratch_size(1, Kokkos::PerTeam(per_team1),
|
|
Kokkos::PerThread(per_thread1)),
|
|
KOKKOS_LAMBDA(
|
|
const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
|
|
int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
errors() += error;
|
|
});
|
|
Kokkos::fence();
|
|
|
|
typename Kokkos::View<int, ExecSpace>::HostMirror h_errors =
|
|
Kokkos::create_mirror_view(errors);
|
|
Kokkos::deep_copy(h_errors, d_errors);
|
|
ASSERT_EQ(h_errors(), 0);
|
|
|
|
int error = 0;
|
|
Kokkos::parallel_reduce(
|
|
policy
|
|
.set_scratch_size(0, Kokkos::PerTeam(per_team0),
|
|
Kokkos::PerThread(per_thread0))
|
|
.set_scratch_size(1, Kokkos::PerTeam(per_team1),
|
|
Kokkos::PerThread(per_thread1)),
|
|
KOKKOS_LAMBDA(
|
|
const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team,
|
|
int &count) {
|
|
count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
},
|
|
error);
|
|
ASSERT_EQ(error, 0);
|
|
#endif
|
|
}
|
|
|
|
} // namespace Test
|
|
|
|
namespace {
|
|
|
|
template <class ExecSpace, class ScheduleType>
|
|
struct TestMultiLevelScratchTeam {
|
|
TestMultiLevelScratchTeam() { run(); }
|
|
|
|
void run() {
|
|
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
|
Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
|
|
#endif
|
|
Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
|
|
c1.run();
|
|
|
|
Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
|
|
c2.run();
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
namespace Test {
|
|
|
|
template <class ExecSpace>
|
|
struct TestShmemSize {
|
|
TestShmemSize() { run(); }
|
|
|
|
void run() {
|
|
using view_type = Kokkos::View<int64_t ***, ExecSpace>;
|
|
|
|
size_t d1 = 5;
|
|
size_t d2 = 6;
|
|
size_t d3 = 7;
|
|
|
|
size_t size = view_type::shmem_size(d1, d2, d3);
|
|
|
|
ASSERT_EQ(size, (d1 * d2 * d3 + 1) * sizeof(int64_t));
|
|
|
|
test_layout_stride();
|
|
}
|
|
|
|
void test_layout_stride() {
|
|
int rank = 3;
|
|
int order[3] = {2, 0, 1};
|
|
int extents[3] = {100, 10, 3};
|
|
auto s1 =
|
|
Kokkos::View<double ***, Kokkos::LayoutStride, ExecSpace>::shmem_size(
|
|
Kokkos::LayoutStride::order_dimensions(rank, order, extents));
|
|
auto s2 =
|
|
Kokkos::View<double ***, Kokkos::LayoutRight, ExecSpace>::shmem_size(
|
|
extents[0], extents[1], extents[2]);
|
|
ASSERT_EQ(s1, s2);
|
|
}
|
|
};
|
|
|
|
} // namespace Test
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
|
|
namespace {
|
|
|
|
template <class ExecSpace, class ScheduleType, class T, class Enabled = void>
|
|
struct TestTeamBroadcast;
|
|
|
|
template <class ExecSpace, class ScheduleType, class T>
|
|
struct TestTeamBroadcast<ExecSpace, ScheduleType, T,
|
|
std::enable_if_t<(sizeof(T) == sizeof(char)), void>> {
|
|
using team_member =
|
|
typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
|
|
using memory_space = typename ExecSpace::memory_space;
|
|
using value_type = T;
|
|
|
|
const value_type offset;
|
|
|
|
TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
|
|
: offset(os_) {}
|
|
|
|
struct BroadcastTag {};
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const team_member &teamMember, value_type &update) const {
|
|
int lid = teamMember.league_rank();
|
|
int tid = teamMember.team_rank();
|
|
int ts = teamMember.team_size();
|
|
|
|
value_type parUpdate = 0;
|
|
value_type value = (value_type)(tid % 0xFF) + offset;
|
|
|
|
// broadcast boolean and value to team from source thread
|
|
teamMember.team_broadcast(value, lid % ts);
|
|
|
|
Kokkos::parallel_reduce(
|
|
Kokkos::TeamThreadRange(teamMember, ts),
|
|
[&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
|
|
Kokkos::BOr<value_type, memory_space>(parUpdate));
|
|
|
|
if (teamMember.team_rank() == 0) update |= parUpdate;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const BroadcastTag &, const team_member &teamMember,
|
|
value_type &update) const {
|
|
int lid = teamMember.league_rank();
|
|
int tid = teamMember.team_rank();
|
|
int ts = teamMember.team_size();
|
|
|
|
value_type parUpdate = 0;
|
|
value_type value = (value_type)(tid % 0xFF) + offset;
|
|
|
|
teamMember.team_broadcast([&](value_type &var) { var -= offset; }, value,
|
|
lid % ts);
|
|
|
|
Kokkos::parallel_reduce(
|
|
Kokkos::TeamThreadRange(teamMember, ts),
|
|
[&](const int /*j*/, value_type &teamUpdate) { teamUpdate |= value; },
|
|
Kokkos::BOr<value_type, memory_space>(parUpdate));
|
|
|
|
if (teamMember.team_rank() == 0) update |= parUpdate;
|
|
}
|
|
|
|
static void test_teambroadcast(const size_t league_size,
|
|
const value_type off) {
|
|
TestTeamBroadcast functor(league_size, off);
|
|
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
|
|
using policy_type_f =
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
|
|
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
int fake_team_size =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
|
|
: 1;
|
|
#else
|
|
int fake_team_size = 1;
|
|
#endif
|
|
const int team_size =
|
|
policy_type_f(league_size, fake_team_size)
|
|
.team_size_max(
|
|
functor,
|
|
Kokkos::
|
|
ParallelReduceTag()); // printf("team_size=%d\n",team_size);
|
|
|
|
// team_broadcast with value
|
|
value_type total = 0;
|
|
|
|
Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
|
|
Kokkos::BOr<value_type, Kokkos::HostSpace>(total));
|
|
|
|
value_type expected_result = 0;
|
|
for (unsigned int i = 0; i < league_size; i++) {
|
|
value_type val = (value_type((i % team_size % 0xFF)) + off);
|
|
expected_result |= val;
|
|
}
|
|
ASSERT_EQ(expected_result, total);
|
|
// printf("team_broadcast with value --"
|
|
//"expected_result=%x,"
|
|
//"total=%x\n",expected_result, total);
|
|
|
|
// team_broadcast with function object
|
|
total = 0;
|
|
|
|
Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
|
|
Kokkos::BOr<value_type, Kokkos::HostSpace>(total));
|
|
|
|
expected_result = 0;
|
|
for (unsigned int i = 0; i < league_size; i++) {
|
|
value_type val = ((value_type)((i % team_size % 0xFF)));
|
|
expected_result |= val;
|
|
}
|
|
ASSERT_EQ(expected_result, total);
|
|
// printf("team_broadcast with function object --"
|
|
// "expected_result=%x,"
|
|
// "total=%x\n",expected_result, total);
|
|
}
|
|
};
|
|
|
|
template <class ExecSpace, class ScheduleType, class T>
|
|
struct TestTeamBroadcast<ExecSpace, ScheduleType, T,
|
|
std::enable_if_t<(sizeof(T) > sizeof(char)), void>> {
|
|
using team_member =
|
|
typename Kokkos::TeamPolicy<ScheduleType, ExecSpace>::member_type;
|
|
using value_type = T;
|
|
|
|
const value_type offset;
|
|
|
|
TestTeamBroadcast(const size_t /*league_size*/, const value_type os_)
|
|
: offset(os_) {}
|
|
|
|
struct BroadcastTag {};
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const team_member &teamMember, value_type &update) const {
|
|
int lid = teamMember.league_rank();
|
|
int tid = teamMember.team_rank();
|
|
int ts = teamMember.team_size();
|
|
|
|
value_type parUpdate = 0;
|
|
value_type value = (value_type)(tid * 3) + offset;
|
|
|
|
// setValue is used to determine if the update should be
|
|
// performed at the bottom. The thread id must match the
|
|
// thread id used to broadcast the value. It is the
|
|
// thread id that matches the league rank mod team size
|
|
// this way each league rank will use a different thread id
|
|
// which is likely not 0
|
|
bool setValue = ((lid % ts) == tid);
|
|
|
|
// broadcast boolean and value to team from source thread
|
|
teamMember.team_broadcast(value, lid % ts);
|
|
teamMember.team_broadcast(setValue, lid % ts);
|
|
|
|
Kokkos::parallel_reduce(
|
|
Kokkos::TeamThreadRange(teamMember, ts),
|
|
[&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
|
|
parUpdate);
|
|
|
|
if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()(const BroadcastTag &, const team_member &teamMember,
|
|
value_type &update) const {
|
|
int lid = teamMember.league_rank();
|
|
int tid = teamMember.team_rank();
|
|
int ts = teamMember.team_size();
|
|
|
|
value_type parUpdate = 0;
|
|
value_type value = (value_type)(tid * 3) + offset;
|
|
|
|
// setValue is used to determine if the update should be
|
|
// performed at the bottom. The thread id must match the
|
|
// thread id used to broadcast the value. It is the
|
|
// thread id that matches the league rank mod team size
|
|
// this way each league rank will use a different thread id
|
|
// which is likely not 0. Note the logic is switched from
|
|
// above because the functor switches it back.
|
|
bool setValue = ((lid % ts) != tid);
|
|
|
|
teamMember.team_broadcast([&](value_type &var) { var += var; }, value,
|
|
lid % ts);
|
|
teamMember.team_broadcast([&](bool &bVar) { bVar = !bVar; }, setValue,
|
|
lid % ts);
|
|
|
|
Kokkos::parallel_reduce(
|
|
Kokkos::TeamThreadRange(teamMember, ts),
|
|
[&](const int /*j*/, value_type &teamUpdate) { teamUpdate += value; },
|
|
parUpdate);
|
|
|
|
if (teamMember.team_rank() == 0 && setValue) update += parUpdate;
|
|
}
|
|
|
|
template <class ScalarType>
|
|
static inline std::enable_if_t<!std::is_integral<ScalarType>::value, void>
|
|
compare_test(ScalarType A, ScalarType B, double epsilon_factor) {
|
|
if (std::is_same<ScalarType, double>::value ||
|
|
std::is_same<ScalarType, float>::value) {
|
|
ASSERT_NEAR((double)A, (double)B,
|
|
epsilon_factor * std::abs(A) *
|
|
std::numeric_limits<ScalarType>::epsilon());
|
|
} else {
|
|
ASSERT_EQ(A, B);
|
|
}
|
|
}
|
|
|
|
template <class ScalarType>
|
|
static inline std::enable_if_t<std::is_integral<ScalarType>::value, void>
|
|
compare_test(ScalarType A, ScalarType B, double) {
|
|
ASSERT_EQ(A, B);
|
|
}
|
|
|
|
static void test_teambroadcast(const size_t league_size,
|
|
const value_type off) {
|
|
TestTeamBroadcast functor(league_size, off);
|
|
|
|
using policy_type = Kokkos::TeamPolicy<ScheduleType, ExecSpace>;
|
|
using policy_type_f =
|
|
Kokkos::TeamPolicy<ScheduleType, ExecSpace, BroadcastTag>;
|
|
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
int fake_team_size =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
|
|
: 1;
|
|
#else
|
|
int fake_team_size = 1;
|
|
#endif
|
|
const int team_size =
|
|
policy_type_f(league_size, fake_team_size)
|
|
.team_size_max(
|
|
functor,
|
|
Kokkos::
|
|
ParallelReduceTag()); // printf("team_size=%d\n",team_size);
|
|
// team_broadcast with value
|
|
value_type total = 0;
|
|
|
|
Kokkos::parallel_reduce(policy_type(league_size, team_size), functor,
|
|
total);
|
|
|
|
value_type expected_result = 0;
|
|
for (unsigned int i = 0; i < league_size; i++) {
|
|
value_type val =
|
|
(value_type((i % team_size) * 3) + off) * value_type(team_size);
|
|
expected_result += val;
|
|
}
|
|
// For comparison purposes treat the reduction as a random walk in the
|
|
// least significant digit, which gives a typical walk distance
|
|
// sqrt(league_size) Add 4x for larger sigma
|
|
compare_test(expected_result, total, 4.0 * std::sqrt(league_size));
|
|
|
|
// team_broadcast with function object
|
|
total = 0;
|
|
|
|
Kokkos::parallel_reduce(policy_type_f(league_size, team_size), functor,
|
|
total);
|
|
|
|
expected_result = 0;
|
|
for (unsigned int i = 0; i < league_size; i++) {
|
|
value_type val = ((value_type)((i % team_size) * 3) + off) *
|
|
(value_type)(2 * team_size);
|
|
expected_result += val;
|
|
}
|
|
// For comparison purposes treat the reduction as a random walk in the
|
|
// least significant digit, which gives a typical walk distance
|
|
// sqrt(league_size) Add 4x for larger sigma
|
|
compare_test(expected_result, total, 4.0 * std::sqrt(league_size));
|
|
}
|
|
};
|
|
|
|
template <class ExecSpace>
|
|
struct TestScratchAlignment {
|
|
struct TestScalar {
|
|
double x, y, z;
|
|
};
|
|
TestScratchAlignment() {
|
|
test_view(true);
|
|
test_view(false);
|
|
test_minimal();
|
|
test_raw();
|
|
}
|
|
using ScratchView =
|
|
Kokkos::View<TestScalar *, typename ExecSpace::scratch_memory_space>;
|
|
using ScratchViewInt =
|
|
Kokkos::View<int *, typename ExecSpace::scratch_memory_space>;
|
|
void test_view(bool allocate_small) {
|
|
int shmem_size = ScratchView::shmem_size(11);
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
int team_size =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
|
|
: 1;
|
|
#else
|
|
int team_size = 1;
|
|
#endif
|
|
if (allocate_small) shmem_size += ScratchViewInt::shmem_size(1);
|
|
Kokkos::parallel_for(
|
|
Kokkos::TeamPolicy<ExecSpace>(1, team_size)
|
|
.set_scratch_size(0, Kokkos::PerTeam(shmem_size)),
|
|
KOKKOS_LAMBDA(
|
|
const typename Kokkos::TeamPolicy<ExecSpace>::member_type &team) {
|
|
if (allocate_small) ScratchViewInt(team.team_scratch(0), 1);
|
|
ScratchView a(team.team_scratch(0), 11);
|
|
if (ptrdiff_t(a.data()) % sizeof(TestScalar) != 0)
|
|
Kokkos::abort("Error: invalid scratch view alignment\n");
|
|
});
|
|
Kokkos::fence();
|
|
}
|
|
|
|
// test really small size of scratch space, produced error before
|
|
void test_minimal() {
|
|
using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
|
|
// FIXME_OPENMPTARGET temporary restriction for team size to be at least 32
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
int team_size =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
|
|
: 1;
|
|
#else
|
|
int team_size = 1;
|
|
#endif
|
|
Kokkos::TeamPolicy<ExecSpace> policy(1, team_size);
|
|
size_t scratch_size = sizeof(int);
|
|
Kokkos::View<int, ExecSpace> flag("Flag");
|
|
|
|
Kokkos::parallel_for(
|
|
policy.set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
|
|
KOKKOS_LAMBDA(const member_type &team) {
|
|
int *scratch_ptr = (int *)team.team_shmem().get_shmem(scratch_size);
|
|
if (scratch_ptr == nullptr) flag() = 1;
|
|
});
|
|
Kokkos::fence();
|
|
int minimal_scratch_allocation_failed = 0;
|
|
Kokkos::deep_copy(minimal_scratch_allocation_failed, flag);
|
|
ASSERT_EQ(minimal_scratch_allocation_failed, 0);
|
|
}
|
|
|
|
// test alignment of successive allocations
|
|
void test_raw() {
|
|
using member_type = typename Kokkos::TeamPolicy<ExecSpace>::member_type;
|
|
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
|
int team_size =
|
|
std::is_same<ExecSpace, Kokkos::Experimental::OpenMPTarget>::value ? 32
|
|
: 1;
|
|
#else
|
|
int team_size = 1;
|
|
#endif
|
|
Kokkos::TeamPolicy<ExecSpace> policy(1, team_size);
|
|
Kokkos::View<int, ExecSpace> flag("Flag");
|
|
|
|
Kokkos::parallel_for(
|
|
policy.set_scratch_size(0, Kokkos::PerTeam(1024)),
|
|
KOKKOS_LAMBDA(const member_type &team) {
|
|
// first get some unaligned allocations, should give back
|
|
// exactly the requested number of bytes
|
|
auto scratch_ptr1 =
|
|
reinterpret_cast<intptr_t>(team.team_shmem().get_shmem(24));
|
|
auto scratch_ptr2 =
|
|
reinterpret_cast<intptr_t>(team.team_shmem().get_shmem(32));
|
|
auto scratch_ptr3 =
|
|
reinterpret_cast<intptr_t>(team.team_shmem().get_shmem(12));
|
|
|
|
if (((scratch_ptr2 - scratch_ptr1) != 24) ||
|
|
((scratch_ptr3 - scratch_ptr2) != 32))
|
|
flag() = 1;
|
|
|
|
// Now request aligned memory such that the allocation after
|
|
// scratch_ptr2 would be unaligned if it doesn't pad correctly.
|
|
// Depending on scratch_ptr3 being 4 or 8 byte aligned
|
|
// we need to request a different amount of memory.
|
|
if ((scratch_ptr3 + 12) % 8 == 4)
|
|
scratch_ptr1 = reinterpret_cast<intptr_t>(
|
|
team.team_shmem().get_shmem_aligned(24, 4));
|
|
else {
|
|
scratch_ptr1 = reinterpret_cast<intptr_t>(
|
|
team.team_shmem().get_shmem_aligned(12, 4));
|
|
}
|
|
scratch_ptr2 = reinterpret_cast<intptr_t>(
|
|
team.team_shmem().get_shmem_aligned(32, 8));
|
|
scratch_ptr3 = reinterpret_cast<intptr_t>(
|
|
team.team_shmem().get_shmem_aligned(8, 4));
|
|
|
|
// The difference between scratch_ptr2 and scratch_ptr1 should be 4
|
|
// bytes larger than what we requested in either case.
|
|
if (((scratch_ptr2 - scratch_ptr1) != 28) &&
|
|
((scratch_ptr2 - scratch_ptr1) != 16))
|
|
flag() = 1;
|
|
// Check that there wasn't unneccessary padding happening. Since
|
|
// scratch_ptr2 was allocated with a 32 byte request and scratch_ptr3
|
|
// is then already aligned, its difference should match 32 bytes.
|
|
if ((scratch_ptr3 - scratch_ptr2) != 32) flag() = 1;
|
|
|
|
// check actually alignment of ptrs is as requested
|
|
// cast to int here to avoid failure with icpx in mixed integer type
|
|
// comparison
|
|
if ((int(scratch_ptr1 % 4) != 0) || (int(scratch_ptr2 % 8) != 0) ||
|
|
(int(scratch_ptr3 % 4) != 0))
|
|
flag() = 1;
|
|
});
|
|
Kokkos::fence();
|
|
int raw_get_shmem_alignment_failed = 0;
|
|
Kokkos::deep_copy(raw_get_shmem_alignment_failed, flag);
|
|
ASSERT_EQ(raw_get_shmem_alignment_failed, 0);
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
namespace {
|
|
template <class ExecSpace>
|
|
struct TestTeamPolicyHandleByValue {
|
|
using scalar = double;
|
|
using exec_space = ExecSpace;
|
|
using mem_space = typename ExecSpace::memory_space;
|
|
|
|
TestTeamPolicyHandleByValue() { test(); }
|
|
|
|
void test() {
|
|
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
|
|
const int M = 1, N = 1;
|
|
Kokkos::View<scalar **, mem_space> a("a", M, N);
|
|
Kokkos::View<scalar **, mem_space> b("b", M, N);
|
|
Kokkos::deep_copy(a, 0.0);
|
|
Kokkos::deep_copy(b, 1.0);
|
|
Kokkos::parallel_for(
|
|
"test_tphandle_by_value",
|
|
Kokkos::TeamPolicy<exec_space>(M, Kokkos::AUTO(), 1),
|
|
KOKKOS_LAMBDA(
|
|
const typename Kokkos::TeamPolicy<exec_space>::member_type team) {
|
|
const int i = team.league_rank();
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 0, N),
|
|
[&](const int j) { a(i, j) += b(i, j); });
|
|
});
|
|
#endif
|
|
}
|
|
};
|
|
|
|
} // namespace
|
|
|
|
namespace {
|
|
template <typename ExecutionSpace>
|
|
struct TestRepeatedTeamReduce {
|
|
static constexpr int ncol = 1500; // nothing special, just some work
|
|
|
|
KOKKOS_FUNCTION void operator()(
|
|
const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type &team)
|
|
const {
|
|
// non-divisible by power of two to make triggering problems easier
|
|
constexpr int nlev = 129;
|
|
constexpr auto pi = Kokkos::numbers::pi;
|
|
double b = 0.;
|
|
for (int ri = 0; ri < 10; ++ri) {
|
|
// The contributions here must be sufficiently complex, simply adding ones
|
|
// wasn't enough to trigger the bug.
|
|
const auto g1 = [&](const int k, double &acc) {
|
|
acc += Kokkos::cos(pi * double(k) / nlev);
|
|
};
|
|
const auto g2 = [&](const int k, double &acc) {
|
|
acc += Kokkos::sin(pi * double(k) / nlev);
|
|
};
|
|
double a1, a2;
|
|
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g1, a1);
|
|
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, nlev), g2, a2);
|
|
b += a1;
|
|
b += a2;
|
|
}
|
|
const auto h = [&]() {
|
|
const auto col = team.league_rank();
|
|
v(col) = b + col;
|
|
};
|
|
Kokkos::single(Kokkos::PerTeam(team), h);
|
|
}
|
|
|
|
KOKKOS_FUNCTION void operator()(const int i, int &bad) const {
|
|
if (v(i) != v(0) + i) {
|
|
++bad;
|
|
Kokkos::printf("Failing at %d!\n", i);
|
|
}
|
|
}
|
|
|
|
TestRepeatedTeamReduce() : v("v", ncol) { test(); }
|
|
|
|
void test() {
|
|
int team_size_recommended =
|
|
Kokkos::TeamPolicy<ExecutionSpace>(1, 1).team_size_recommended(
|
|
*this, Kokkos::ParallelForTag());
|
|
// Choose a non-recommened (non-power of two for GPUs) team size
|
|
int team_size = team_size_recommended > 1 ? team_size_recommended - 1 : 1;
|
|
|
|
// The failure was non-deterministic so run the test a bunch of times
|
|
for (int it = 0; it < 100; ++it) {
|
|
Kokkos::parallel_for(
|
|
Kokkos::TeamPolicy<ExecutionSpace>(ncol, team_size, 1), *this);
|
|
|
|
int bad = 0;
|
|
Kokkos::parallel_reduce(Kokkos::RangePolicy<ExecutionSpace>(0, ncol),
|
|
*this, bad);
|
|
ASSERT_EQ(bad, 0) << " Failing in iteration " << it;
|
|
}
|
|
}
|
|
|
|
Kokkos::View<double *, ExecutionSpace> v;
|
|
};
|
|
|
|
} // namespace
|
|
|
|
} // namespace Test
|
|
|
|
/*--------------------------------------------------------------------------*/
|