Update Kokkos library in LAMMPS to v3.5.0

This commit is contained in:
Stan Gerald Moore
2021-11-04 12:45:59 -06:00
parent 515ef7bece
commit 564098e629
396 changed files with 21892 additions and 8508 deletions

View File

@ -5,9 +5,7 @@ KOKKOS_SUBPACKAGE(Algorithms)
IF (NOT Kokkos_INSTALL_TESTING)
ADD_SUBDIRECTORY(src)
ENDIF()
IF(NOT (KOKKOS_ENABLE_OPENMPTARGET
AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR
KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)))
IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
ENDIF()

View File

@ -447,6 +447,25 @@ struct rand<Generator, unsigned long long> {
}
};
#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
template <class Generator>
struct rand<Generator, Kokkos::Experimental::half_t> {
using half = Kokkos::Experimental::half_t;
KOKKOS_INLINE_FUNCTION
static half max() { return half(1.0); }
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen) { return half(gen.frand()); }
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen, const half& range) {
return half(gen.frand(float(range)));
}
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen, const half& start, const half& end) {
return half(gen.frand(float(start), float(end)));
}
};
#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
template <class Generator>
struct rand<Generator, float> {
KOKKOS_INLINE_FUNCTION
@ -600,7 +619,7 @@ struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
template <class ExecutionSpace>
struct Random_UniqueIndex {
using locks_view_type = View<int*, ExecutionSpace>;
using locks_view_type = View<int**, ExecutionSpace>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type) {
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
@ -615,7 +634,7 @@ struct Random_UniqueIndex {
#ifdef KOKKOS_ENABLE_CUDA
template <>
struct Random_UniqueIndex<Kokkos::Cuda> {
using locks_view_type = View<int*, Kokkos::Cuda>;
using locks_view_type = View<int**, Kokkos::Cuda>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
#ifdef __CUDA_ARCH__
@ -625,7 +644,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
blockDim.x * blockDim.y * blockDim.z +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
i += blockDim.x * blockDim.y * blockDim.z;
if (i >= static_cast<int>(locks_.extent(0))) {
i = i_offset;
@ -643,7 +662,7 @@ struct Random_UniqueIndex<Kokkos::Cuda> {
#ifdef KOKKOS_ENABLE_HIP
template <>
struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
using locks_view_type = View<int**, Kokkos::Experimental::HIP>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
#ifdef __HIP_DEVICE_COMPILE__
@ -653,7 +672,7 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
blockDim.x * blockDim.y * blockDim.z +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
i += blockDim.x * blockDim.y * blockDim.z;
if (i >= static_cast<int>(locks_.extent(0))) {
i = i_offset;
@ -671,15 +690,15 @@ struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
#ifdef KOKKOS_ENABLE_SYCL
template <>
struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
using locks_view_type = View<int*, Kokkos::Experimental::SYCL>;
using locks_view_type = View<int**, Kokkos::Experimental::SYCL>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
#ifdef KOKKOS_ARCH_INTEL_GEN
#ifdef KOKKOS_ARCH_INTEL_GPU
int i = Kokkos::Impl::clock_tic() % locks_.extent(0);
#else
int i = 0;
#endif
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
i = (i + 1) % static_cast<int>(locks_.extent(0));
}
return i;
@ -690,14 +709,14 @@ struct Random_UniqueIndex<Kokkos::Experimental::SYCL> {
#ifdef KOKKOS_ENABLE_OPENMPTARGET
template <>
struct Random_UniqueIndex<Kokkos::Experimental::OpenMPTarget> {
using locks_view_type = View<int*, Kokkos::Experimental::OpenMPTarget>;
using locks_view_type = View<int**, Kokkos::Experimental::OpenMPTarget>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks) {
const int team_size = omp_get_num_threads();
int i = omp_get_team_num() * team_size + omp_get_thread_num();
const int lock_size = locks.extent_int(0);
while (Kokkos::atomic_compare_exchange(&locks(i), 0, 1)) {
while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
i = (i + 1) % lock_size;
}
return i;
@ -856,18 +875,22 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift64_Pool {
private:
using execution_space = typename DeviceType::execution_space;
using locks_type = View<int*, execution_space>;
using state_data_type = View<uint64_t*, DeviceType>;
using locks_type = View<int**, execution_space>;
using state_data_type = View<uint64_t**, DeviceType>;
locks_type locks_;
state_data_type state_;
int num_states_;
int padding_;
public:
using generator_type = Random_XorShift64<DeviceType>;
using device_type = DeviceType;
KOKKOS_INLINE_FUNCTION
Random_XorShift64_Pool() { num_states_ = 0; }
Random_XorShift64_Pool() {
num_states_ = 0;
padding_ = 0;
}
Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
@ -883,16 +906,22 @@ class Random_XorShift64_Pool {
locks_ = src.locks_;
state_ = src.state_;
num_states_ = src.num_states_;
padding_ = src.padding_;
return *this;
}
void init(uint64_t seed, int num_states) {
if (seed == 0) seed = uint64_t(1318319);
// I only want to pad on CPU like archs (less than 1000 threads). 64 is a
// magic number, or random number I just wanted something not too large and
// not too small. 64 sounded fine.
padding_ = num_states < 1000 ? 64 : 1;
num_states_ = num_states;
locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);
locks_ =
locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_);
state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_,
padding_);
typename state_data_type::HostMirror h_state = create_mirror_view(state_);
typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
@ -902,15 +931,15 @@ class Random_XorShift64_Pool {
gen(seed, 0);
for (int i = 0; i < 17; i++) gen.rand();
for (int i = 0; i < num_states_; i++) {
int n1 = gen.rand();
int n2 = gen.rand();
int n3 = gen.rand();
int n4 = gen.rand();
h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
(((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
(((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
(((static_cast<uint64_t>(n4)) & 0xffff) << 48);
h_lock(i) = 0;
int n1 = gen.rand();
int n2 = gen.rand();
int n3 = gen.rand();
int n4 = gen.rand();
h_state(i, 0) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
(((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
(((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
(((static_cast<uint64_t>(n4)) & 0xffff) << 48);
h_lock(i, 0) = 0;
}
deep_copy(state_, h_state);
deep_copy(locks_, h_lock);
@ -920,19 +949,19 @@ class Random_XorShift64_Pool {
Random_XorShift64<DeviceType> get_state() const {
const int i =
Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
return Random_XorShift64<DeviceType>(state_(i), i);
return Random_XorShift64<DeviceType>(state_(i, 0), i);
}
// NOTE: state_idx MUST be unique and less than num_states
KOKKOS_INLINE_FUNCTION
Random_XorShift64<DeviceType> get_state(const int state_idx) const {
return Random_XorShift64<DeviceType>(state_(state_idx), state_idx);
return Random_XorShift64<DeviceType>(state_(state_idx, 0), state_idx);
}
KOKKOS_INLINE_FUNCTION
void free_state(const Random_XorShift64<DeviceType>& state) const {
state_(state.state_idx_) = state.state_;
locks_(state.state_idx_) = 0;
state_(state.state_idx_, 0) = state.state_;
locks_(state.state_idx_, 0) = 0;
}
};
@ -1092,14 +1121,15 @@ template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift1024_Pool {
private:
using execution_space = typename DeviceType::execution_space;
using locks_type = View<int*, execution_space>;
using int_view_type = View<int*, DeviceType>;
using locks_type = View<int**, execution_space>;
using int_view_type = View<int**, DeviceType>;
using state_data_type = View<uint64_t * [16], DeviceType>;
locks_type locks_;
state_data_type state_;
int_view_type p_;
int num_states_;
int padding_;
friend class Random_XorShift1024<DeviceType>;
public:
@ -1129,15 +1159,21 @@ class Random_XorShift1024_Pool {
state_ = src.state_;
p_ = src.p_;
num_states_ = src.num_states_;
padding_ = src.padding_;
return *this;
}
inline void init(uint64_t seed, int num_states) {
if (seed == 0) seed = uint64_t(1318319);
// I only want to pad on CPU like archs (less than 1000 threads). 64 is a
// magic number, or random number I just wanted something not too large and
// not too small. 64 sounded fine.
padding_ = num_states < 1000 ? 64 : 1;
num_states_ = num_states;
locks_ = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
locks_ =
locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_);
state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);
p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_);
typename state_data_type::HostMirror h_state = create_mirror_view(state_);
typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
@ -1158,8 +1194,8 @@ class Random_XorShift1024_Pool {
(((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
(((static_cast<uint64_t>(n4)) & 0xffff) << 48);
}
h_p(i) = 0;
h_lock(i) = 0;
h_p(i, 0) = 0;
h_lock(i, 0) = 0;
}
deep_copy(state_, h_state);
deep_copy(locks_, h_lock);
@ -1169,20 +1205,20 @@ class Random_XorShift1024_Pool {
Random_XorShift1024<DeviceType> get_state() const {
const int i =
Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
return Random_XorShift1024<DeviceType>(state_, p_(i), i);
return Random_XorShift1024<DeviceType>(state_, p_(i, 0), i);
};
// NOTE: state_idx MUST be unique and less than num_states
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
return Random_XorShift1024<DeviceType>(state_, p_(state_idx), state_idx);
return Random_XorShift1024<DeviceType>(state_, p_(state_idx, 0), state_idx);
}
KOKKOS_INLINE_FUNCTION
void free_state(const Random_XorShift1024<DeviceType>& state) const {
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
p_(state.state_idx_) = state.p_;
locks_(state.state_idx_) = 0;
p_(state.state_idx_, 0) = state.p_;
locks_(state.state_idx_, 0) = 0;
}
};

View File

@ -319,7 +319,7 @@ class BinSort {
Kokkos::RangePolicy<execution_space>(0, len), functor);
}
execution_space().fence();
execution_space().fence("Kokkos::Sort: fence after sorting");
}
template <class ValuesViewType>
@ -492,7 +492,8 @@ bool try_std_sort(ViewType view) {
view.stride_3(), view.stride_4(), view.stride_5(),
view.stride_6(), view.stride_7()};
possible = possible &&
std::is_same<typename ViewType::memory_space, HostSpace>::value;
SpaceAccessibility<HostSpace,
typename ViewType::memory_space>::accessible;
possible = possible && (ViewType::Rank == 1);
possible = possible && (stride[0] == 1);
if (possible) {

View File

@ -47,7 +47,7 @@
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_Timer.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <cmath>
@ -198,11 +198,50 @@ struct test_random_functor {
static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp2 / theMax);
const uint64_t ind3_3d =
static_cast<uint64_t>(1.0 * HIST_DIM3D * tmp3 / theMax);
// Workaround Intel 17 compiler bug which sometimes add random
// instruction alignment which makes the lock instruction
// illegal. Seems to be mostly just for unsigned int atomics.
// Looking at the assembly the compiler
// appears to insert cache line alignment for the instruction.
// Isn't restricted to specific archs. Seen it on SNB and SKX, but for
// different code. Another occurrence was with Desul atomics in
// a different unit test. This one here happens without desul atomics.
// Inserting an assembly nop instruction changes the alignment and
// works round this.
//
// 17.0.4 for 64bit Random works with 1/1/1/2/1
// 17.0.4 for 1024bit Random works with 1/1/1/1/1
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
asm volatile("nop\n");
#endif
#endif
atomic_fetch_add(&density_1d(ind1_1d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
asm volatile("nop\n");
#endif
#endif
atomic_fetch_add(&density_1d(ind2_1d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
asm volatile("nop\n");
#endif
#endif
atomic_fetch_add(&density_1d(ind3_1d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
if (std::is_same<rnd_type, Kokkos::Random_XorShift64<device_type>>::value)
asm volatile("nop\n");
asm volatile("nop\n");
#endif
#endif
atomic_fetch_add(&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
#ifdef KOKKOS_COMPILER_INTEL
#if (KOKKOS_COMPILER_INTEL < 1800)
asm volatile("nop\n");
#endif
#endif
}
rand_pool.free_state(rand_gen);
}
@ -338,9 +377,11 @@ struct test_random_scalar {
using functor_type =
test_histogram1d_functor<typename RandomGenerator::device_type>;
parallel_reduce(HIST_DIM1D, functor_type(density_1d, num_draws), result);
double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D;
double mean_eps_expect = 0.0001;
double variance_eps_expect = 0.07;
double covariance_eps_expect = 0.06;
double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
double mean_expect = 1.0 * num_draws * 3 / HIST_DIM1D;
double variance_expect =
1.0 * num_draws * 3 / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
double covariance_expect = -1.0 * num_draws * 3 / HIST_DIM1D / HIST_DIM1D;
@ -349,11 +390,26 @@ struct test_random_scalar {
variance_expect / (result.variance / HIST_DIM1D) - 1.0;
double covariance_eps =
(result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
pass_hist1d_mean = ((-0.0001 < mean_eps) && (0.0001 > mean_eps)) ? 1 : 0;
pass_hist1d_var =
((-0.07 < variance_eps) && (0.07 > variance_eps)) ? 1 : 0;
pass_hist1d_covar =
((-0.06 < covariance_eps) && (0.06 > covariance_eps)) ? 1 : 0;
#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
mean_eps_expect = 0.0003;
variance_eps_expect = 1.0;
covariance_eps_expect = 5.0e4;
}
#endif
pass_hist1d_mean =
((-mean_eps_expect < mean_eps) && (mean_eps_expect > mean_eps)) ? 1
: 0;
pass_hist1d_var = ((-variance_eps_expect < variance_eps) &&
(variance_eps_expect > variance_eps))
? 1
: 0;
pass_hist1d_covar = ((-covariance_eps_expect < covariance_eps) &&
(covariance_eps_expect > covariance_eps))
? 1
: 0;
cout << "Density 1D: " << mean_eps << " " << variance_eps << " "
<< (result.covariance / HIST_DIM1D / HIST_DIM1D) << " || "
@ -371,8 +427,9 @@ struct test_random_scalar {
test_histogram3d_functor<typename RandomGenerator::device_type>;
parallel_reduce(HIST_DIM1D, functor_type(density_3d, num_draws), result);
double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
double mean_expect = 1.0 * num_draws / HIST_DIM1D;
double variance_factor = 1.2;
double tolerance = 6 * std::sqrt(1.0 / HIST_DIM1D);
double mean_expect = 1.0 * num_draws / HIST_DIM1D;
double variance_expect =
1.0 * num_draws / HIST_DIM1D * (1.0 - 1.0 / HIST_DIM1D);
double covariance_expect = -1.0 * num_draws / HIST_DIM1D / HIST_DIM1D;
@ -381,15 +438,23 @@ struct test_random_scalar {
variance_expect / (result.variance / HIST_DIM1D) - 1.0;
double covariance_eps =
(result.covariance / HIST_DIM1D - covariance_expect) / mean_expect;
#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
if (std::is_same<Scalar, Kokkos::Experimental::half_t>::value) {
variance_factor = 7;
}
#endif
pass_hist3d_mean =
((-tolerance < mean_eps) && (tolerance > mean_eps)) ? 1 : 0;
pass_hist3d_var = ((-1.2 * tolerance < variance_eps) &&
(1.2 * tolerance > variance_eps))
pass_hist3d_var = ((-variance_factor * tolerance < variance_eps) &&
(variance_factor * tolerance > variance_eps))
? 1
: 0;
pass_hist3d_covar =
((-tolerance < covariance_eps) && (tolerance > covariance_eps)) ? 1
: 0;
pass_hist3d_covar = ((-variance_factor * tolerance < covariance_eps) &&
(variance_factor * tolerance > covariance_eps))
? 1
: 0;
cout << "Density 3D: " << mean_eps << " " << variance_eps << " "
<< result.covariance / HIST_DIM1D / HIST_DIM1D << " || " << tolerance
@ -471,6 +536,21 @@ void test_random(unsigned int num_draws) {
deep_copy(density_1d, 0);
deep_copy(density_3d, 0);
cout << "Test Scalar=half" << endl;
test_random_scalar<RandomGenerator, Kokkos::Experimental::half_t> test_half(
density_1d, density_3d, pool, num_draws);
ASSERT_EQ(test_half.pass_mean, 1);
ASSERT_EQ(test_half.pass_var, 1);
ASSERT_EQ(test_half.pass_covar, 1);
ASSERT_EQ(test_half.pass_hist1d_mean, 1);
ASSERT_EQ(test_half.pass_hist1d_var, 1);
ASSERT_EQ(test_half.pass_hist1d_covar, 1);
ASSERT_EQ(test_half.pass_hist3d_mean, 1);
ASSERT_EQ(test_half.pass_hist3d_var, 1);
ASSERT_EQ(test_half.pass_hist3d_covar, 1);
deep_copy(density_1d, 0);
deep_copy(density_3d, 0);
cout << "Test Scalar=float" << endl;
test_random_scalar<RandomGenerator, float> test_float(density_1d, density_3d,
pool, num_draws);