Update Kokkos library in LAMMPS to v3.1

This commit is contained in:
Stan Moore
2020-04-16 09:06:08 -06:00
parent fa6922a182
commit ba8d043c7e
554 changed files with 24628 additions and 14871 deletions

View File

@ -537,6 +537,145 @@ struct rand<Generator, Kokkos::complex<double> > {
}
};
template <class DeviceType>
class Random_XorShift1024_Pool;
namespace Impl {
template <bool UseCArrayState>
struct Random_XorShift1024_State {
uint64_t state_[16];
KOKKOS_DEFAULTED_FUNCTION
Random_XorShift1024_State() = default;
template <class StateViewType>
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
int state_idx) {
for (int i = 0; i < 16; i++) state_[i] = v(state_idx, i);
}
KOKKOS_FUNCTION
uint64_t operator[](const int i) const { return state_[i]; }
KOKKOS_FUNCTION
uint64_t& operator[](const int i) { return state_[i]; }
};
template <>
struct Random_XorShift1024_State<false> {
uint64_t* state_;
const int stride_;
KOKKOS_FUNCTION
Random_XorShift1024_State() : state_(nullptr), stride_(1){};
template <class StateViewType>
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
int state_idx)
: state_(&v(state_idx, 0)), stride_(v.stride_1()) {}
KOKKOS_FUNCTION
uint64_t operator[](const int i) const { return state_[i * stride_]; }
KOKKOS_FUNCTION
uint64_t& operator[](const int i) { return state_[i * stride_]; }
};
template <class ExecutionSpace>
struct Random_XorShift1024_UseCArrayState : std::true_type {};
#ifdef KOKKOS_ENABLE_CUDA
template <>
struct Random_XorShift1024_UseCArrayState<Kokkos::Cuda> : std::false_type {};
#endif
#ifdef KOKKOS_ENABLE_HIP
template <>
struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::HIP>
: std::false_type {};
#endif
#ifdef KOKKOS_ENABLE_OPENMPTARGET
template <>
struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
: std::false_type {};
#endif
template <class ExecutionSpace>
struct Random_UniqueIndex {
using locks_view_type = View<int*, ExecutionSpace>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type) {
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
const int i = ExecutionSpace::hardware_thread_id();
#else
const int i = ExecutionSpace::impl_hardware_thread_id();
#endif
return i;
#else
return 0;
#endif
}
};
#ifdef KOKKOS_ENABLE_CUDA
template <>
struct Random_UniqueIndex<Kokkos::Cuda> {
using locks_view_type = View<int*, Kokkos::Cuda>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
#ifdef __CUDA_ARCH__
const int i_offset =
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
blockDim.x * blockDim.y * blockDim.z +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
i += blockDim.x * blockDim.y * blockDim.z;
if (i >= static_cast<int>(locks_.extent(0))) {
i = i_offset;
}
}
return i;
#else
(void)locks_;
return 0;
#endif
}
};
#endif
#ifdef KOKKOS_ENABLE_HIP
template <>
struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
#ifdef __HIP_DEVICE_COMPILE__
const int i_offset =
(hipThreadIdx_x * hipBlockDim_y + hipThreadIdx_y) * hipBlockDim_z +
hipThreadIdx_z;
int i = (((hipBlockIdx_x * hipGridDim_y + hipBlockIdx_y) * hipGridDim_z +
hipBlockIdx_z) *
hipBlockDim_x * hipBlockDim_y * hipBlockDim_z +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
i += hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
if (i >= static_cast<int>(locks_.extent(0))) {
i = i_offset;
}
}
return i;
#else
(void)locks_;
return 0;
#endif
}
};
#endif
} // namespace Impl
template <class DeviceType>
class Random_XorShift64_Pool;
@ -550,10 +689,10 @@ class Random_XorShift64 {
public:
typedef DeviceType device_type;
enum { MAX_URAND = 0xffffffffU };
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
enum { MAX_RAND = static_cast<int>(0xffffffff / 2) };
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffLL / 2 - 1) };
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max();
constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max();
KOKKOS_INLINE_FUNCTION
Random_XorShift64(uint64_t state, int state_idx = 0)
@ -637,10 +776,12 @@ class Random_XorShift64 {
}
KOKKOS_INLINE_FUNCTION
float frand() { return 1.0f * urand64() / MAX_URAND64; }
float frand() { return urand64() / static_cast<float>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
float frand(const float& range) {
return range * urand64() / static_cast<float>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
@ -648,10 +789,12 @@ class Random_XorShift64 {
}
KOKKOS_INLINE_FUNCTION
double drand() { return 1.0 * urand64() / MAX_URAND64; }
double drand() { return urand64() / static_cast<double>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
double drand(const double& range) {
return range * urand64() / static_cast<double>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
@ -662,6 +805,11 @@ class Random_XorShift64 {
// number
KOKKOS_INLINE_FUNCTION
double normal() {
#ifndef __HIP_DEVICE_COMPILE__ // FIXME_HIP
using std::sqrt;
#else
using ::sqrt;
#endif
double S = 2.0;
double U;
while (S >= 1.0) {
@ -669,7 +817,7 @@ class Random_XorShift64 {
const double V = 2.0 * drand() - 1.0;
S = U * U + V * V;
}
return U * std::sqrt(-2.0 * log(S) / S);
return U * sqrt(-2.0 * log(S) / S);
}
KOKKOS_INLINE_FUNCTION
@ -681,9 +829,10 @@ class Random_XorShift64 {
template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift64_Pool {
private:
typedef View<int*, DeviceType> lock_type;
using execution_space = typename DeviceType::execution_space;
typedef View<int*, execution_space> locks_type;
typedef View<uint64_t*, DeviceType> state_data_type;
lock_type locks_;
locks_type locks_;
state_data_type state_;
int num_states_;
@ -695,11 +844,8 @@ class Random_XorShift64_Pool {
Random_XorShift64_Pool() { num_states_ = 0; }
Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
init(seed, DeviceType::max_hardware_threads());
#else
init(seed, DeviceType::impl_max_hardware_threads());
#endif
init(seed, execution_space().concurrency());
}
KOKKOS_INLINE_FUNCTION
@ -719,11 +865,11 @@ class Random_XorShift64_Pool {
num_states_ = num_states;
locks_ = lock_type("Kokkos::Random_XorShift64::locks", num_states_);
locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);
typename state_data_type::HostMirror h_state = create_mirror_view(state_);
typename lock_type::HostMirror h_lock = create_mirror_view(locks_);
typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
// Execute on the HostMirror's default execution space.
Random_XorShift64<typename state_data_type::HostMirror::execution_space>
@ -746,13 +892,8 @@ class Random_XorShift64_Pool {
KOKKOS_INLINE_FUNCTION
Random_XorShift64<DeviceType> get_state() const {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
const int i = DeviceType::hardware_thread_id();
;
#else
const int i = DeviceType::impl_hardware_thread_id();
;
#endif
const int i =
Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
return Random_XorShift64<DeviceType>(state_(i), i);
}
@ -765,35 +906,35 @@ class Random_XorShift64_Pool {
KOKKOS_INLINE_FUNCTION
void free_state(const Random_XorShift64<DeviceType>& state) const {
state_(state.state_idx_) = state.state_;
locks_(state.state_idx_) = 0;
}
};
template <class DeviceType>
class Random_XorShift1024_Pool;
template <class DeviceType>
class Random_XorShift1024 {
using execution_space = typename DeviceType::execution_space;
private:
int p_;
const int state_idx_;
uint64_t state_[16];
Impl::Random_XorShift1024_State<
Impl::Random_XorShift1024_UseCArrayState<execution_space>::value>
state_;
friend class Random_XorShift1024_Pool<DeviceType>;
public:
typedef Random_XorShift1024_Pool<DeviceType> pool_type;
typedef DeviceType device_type;
enum { MAX_URAND = 0xffffffffU };
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max();
constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max();
KOKKOS_INLINE_FUNCTION
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
int state_idx = 0)
: p_(p), state_idx_(state_idx) {
for (int i = 0; i < 16; i++) state_[i] = state(state_idx, i);
}
: p_(p), state_idx_(state_idx), state_(state, state_idx) {}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
@ -876,10 +1017,12 @@ class Random_XorShift1024 {
}
KOKKOS_INLINE_FUNCTION
float frand() { return 1.0f * urand64() / MAX_URAND64; }
float frand() { return urand64() / static_cast<float>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
float frand(const float& range) {
return range * urand64() / static_cast<float>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
@ -887,10 +1030,12 @@ class Random_XorShift1024 {
}
KOKKOS_INLINE_FUNCTION
double drand() { return 1.0 * urand64() / MAX_URAND64; }
double drand() { return urand64() / static_cast<double>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
double drand(const double& range) {
return range * urand64() / static_cast<double>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
@ -901,6 +1046,11 @@ class Random_XorShift1024 {
// number
KOKKOS_INLINE_FUNCTION
double normal() {
#ifndef KOKKOS_ENABLE_HIP // FIXME_HIP
using std::sqrt;
#else
using ::sqrt;
#endif
double S = 2.0;
double U;
while (S >= 1.0) {
@ -908,7 +1058,7 @@ class Random_XorShift1024 {
const double V = 2.0 * drand() - 1.0;
S = U * U + V * V;
}
return U * std::sqrt(-2.0 * log(S) / S);
return U * sqrt(-2.0 * log(S) / S);
}
KOKKOS_INLINE_FUNCTION
@ -920,10 +1070,12 @@ class Random_XorShift1024 {
template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift1024_Pool {
private:
using execution_space = typename DeviceType::execution_space;
typedef View<int*, execution_space> locks_type;
typedef View<int*, DeviceType> int_view_type;
typedef View<uint64_t * [16], DeviceType> state_data_type;
int_view_type locks_;
locks_type locks_;
state_data_type state_;
int_view_type p_;
int num_states_;
@ -939,11 +1091,8 @@ class Random_XorShift1024_Pool {
inline Random_XorShift1024_Pool(uint64_t seed) {
num_states_ = 0;
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
init(seed, DeviceType::max_hardware_threads());
#else
init(seed, DeviceType::impl_max_hardware_threads());
#endif
init(seed, execution_space().concurrency());
}
KOKKOS_INLINE_FUNCTION
@ -965,12 +1114,12 @@ class Random_XorShift1024_Pool {
inline void init(uint64_t seed, int num_states) {
if (seed == 0) seed = uint64_t(1318319);
num_states_ = num_states;
locks_ = int_view_type("Kokkos::Random_XorShift1024::locks", num_states_);
locks_ = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);
typename state_data_type::HostMirror h_state = create_mirror_view(state_);
typename int_view_type::HostMirror h_lock = create_mirror_view(locks_);
typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
typename int_view_type::HostMirror h_p = create_mirror_view(p_);
// Execute on the HostMirror's default execution space.
@ -997,11 +1146,8 @@ class Random_XorShift1024_Pool {
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<DeviceType> get_state() const {
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
const int i = DeviceType::hardware_thread_id();
#else
const int i = DeviceType::impl_hardware_thread_id();
#endif
const int i =
Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
return Random_XorShift1024<DeviceType>(state_, p_(i), i);
};
@ -1014,482 +1160,11 @@ class Random_XorShift1024_Pool {
KOKKOS_INLINE_FUNCTION
void free_state(const Random_XorShift1024<DeviceType>& state) const {
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
p_(state.state_idx_) = state.p_;
p_(state.state_idx_) = state.p_;
locks_(state.state_idx_) = 0;
}
};
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
template <>
class Random_XorShift1024<Kokkos::Cuda> {
private:
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
public:
typedef Kokkos::Cuda device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum { MAX_URAND = 0xffffffffU };
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
KOKKOS_INLINE_FUNCTION
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
int state_idx = 0)
: p_(p),
state_idx_(state_idx),
state_(&state(state_idx, 0)),
stride_(state.stride_1()) {}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[p_ * stride_];
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp =
(state_[p_ * stride_] = state_0 ^ state_1) * 1181783497276652981ULL;
tmp = tmp >> 16;
return static_cast<uint32_t>(tmp & MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[p_ * stride_];
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return ((state_[p_ * stride_] = state_0 ^ state_1) *
1181783497276652981LL) -
1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND / range) * range;
uint32_t tmp = urand();
while (tmp >= max_val) urand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end) {
return urand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64 / range) * range;
uint64_t tmp = urand64();
while (tmp >= max_val) urand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
return urand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int rand() { return static_cast<int>(urand() / 2); }
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND / range) * range;
int tmp = rand();
while (tmp >= max_val) rand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end) {
return rand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64 / range) * range;
int64_t tmp = rand64();
while (tmp >= max_val) rand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end) {
return rand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
float frand() { return 1.0f * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
return frand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
double drand() { return 1.0 * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
return frand(end - start) + start;
}
// Marsaglia polar method for drawing a standard normal distributed random
// number
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while (S >= 1.0) {
U = 2.0 * drand() - 1.0;
const double V = 2.0 * drand() - 1.0;
S = U * U + V * V;
}
return U * std::sqrt(-2.0 * log(S) / S);
}
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev = 1.0) {
return mean + normal() * std_dev;
}
};
template <>
inline Random_XorShift64_Pool<Kokkos::Cuda>::Random_XorShift64_Pool(
uint64_t seed) {
num_states_ = 0;
init(seed, 4 * 32768);
}
template <>
KOKKOS_INLINE_FUNCTION Random_XorShift64<Kokkos::Cuda>
Random_XorShift64_Pool<Kokkos::Cuda>::get_state() const {
#ifdef __CUDA_ARCH__
const int i_offset =
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
blockDim.x * blockDim.y * blockDim.z +
i_offset) %
num_states_;
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
i += blockDim.x * blockDim.y * blockDim.z;
if (i >= num_states_) {
i = i_offset;
}
}
return Random_XorShift64<Kokkos::Cuda>(state_(i), i);
#else
return Random_XorShift64<Kokkos::Cuda>(state_(0), 0);
#endif
}
template <>
KOKKOS_INLINE_FUNCTION void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(
const Random_XorShift64<Kokkos::Cuda>& state) const {
state_(state.state_idx_) = state.state_;
#ifdef __CUDA_ARCH__
locks_(state.state_idx_) = 0;
return;
#endif
}
template <>
inline Random_XorShift1024_Pool<Kokkos::Cuda>::Random_XorShift1024_Pool(
uint64_t seed) {
num_states_ = 0;
init(seed, 4 * 32768);
}
template <>
KOKKOS_INLINE_FUNCTION Random_XorShift1024<Kokkos::Cuda>
Random_XorShift1024_Pool<Kokkos::Cuda>::get_state() const {
#ifdef __CUDA_ARCH__
const int i_offset =
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
blockDim.x * blockDim.y * blockDim.z +
i_offset) %
num_states_;
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
i += blockDim.x * blockDim.y * blockDim.z;
if (i >= num_states_) {
i = i_offset;
}
}
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
#endif
}
template <>
KOKKOS_INLINE_FUNCTION void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(
const Random_XorShift1024<Kokkos::Cuda>& state) const {
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
#ifdef __CUDA_ARCH__
locks_(state.state_idx_) = 0;
return;
#endif
}
#endif
#if defined(KOKKOS_ENABLE_ROCM)
template <>
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
private:
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
public:
typedef Kokkos::Experimental::ROCm device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum { MAX_URAND = 0xffffffffU };
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
KOKKOS_INLINE_FUNCTION
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
int state_idx = 0)
: p_(p),
state_idx_(state_idx),
state_(&state(state_idx, 0)),
stride_(state.stride_1()) {}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[p_ * stride_];
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp =
(state_[p_ * stride_] = state_0 ^ state_1) * 1181783497276652981ULL;
tmp = tmp >> 16;
return static_cast<uint32_t>(tmp & MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[p_ * stride_];
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return ((state_[p_ * stride_] = state_0 ^ state_1) *
1181783497276652981LL) -
1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND / range) * range;
uint32_t tmp = urand();
while (tmp >= max_val) urand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end) {
return urand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64 / range) * range;
uint64_t tmp = urand64();
while (tmp >= max_val) urand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
return urand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int rand() { return static_cast<int>(urand() / 2); }
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND / range) * range;
int tmp = rand();
while (tmp >= max_val) rand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end) {
return rand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64 / range) * range;
int64_t tmp = rand64();
while (tmp >= max_val) rand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end) {
return rand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
float frand() { return 1.0f * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
return frand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
double drand() { return 1.0 * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
return frand(end - start) + start;
}
// Marsaglia polar method for drawing a standard normal distributed random
// number
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while (S >= 1.0) {
U = 2.0 * drand() - 1.0;
const double V = 2.0 * drand() - 1.0;
S = U * U + V * V;
}
return U * std::sqrt(-2.0 * log(S) / S);
}
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev = 1.0) {
return mean + normal() * std_dev;
}
};
template <>
inline Random_XorShift64_Pool<
Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
init(seed, 4 * 32768);
}
template <>
KOKKOS_INLINE_FUNCTION Random_XorShift64<Kokkos::Experimental::ROCm>
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset =
(threadIdx_x * blockDim_y + threadIdx_y) * blockDim_z + threadIdx_z;
int i = (((blockIdx_x * gridDim_y + blockIdx_y) * gridDim_z + blockIdx_z) *
blockDim_x * blockDim_y * blockDim_z +
i_offset) %
num_states_;
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
i += blockDim_x * blockDim_y * blockDim_z;
if (i >= num_states_) {
i = i_offset;
}
}
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i), i);
#else
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0), 0);
#endif
}
template <>
KOKKOS_INLINE_FUNCTION void
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(
const Random_XorShift64<Kokkos::Experimental::ROCm>& state) const {
#ifdef __HCC_ACCELERATOR__
state_(state.state_idx_) = state.state_;
locks_(state.state_idx_) = 0;
return;
#endif
}
template <>
inline Random_XorShift1024_Pool<
Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
num_states_ = 0;
init(seed, 4 * 32768);
}
template <>
KOKKOS_INLINE_FUNCTION Random_XorShift1024<Kokkos::Experimental::ROCm>
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset =
(threadIdx_x * blockDim_y + threadIdx_y) * blockDim_z + threadIdx_z;
int i = (((blockIdx_x * gridDim_y + blockIdx_y) * gridDim_z + blockIdx_z) *
blockDim_x * blockDim_y * blockDim_z +
i_offset) %
num_states_;
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
i += blockDim_x * blockDim_y * blockDim_z;
if (i >= num_states_) {
i = i_offset;
}
}
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
#endif
}
template <>
KOKKOS_INLINE_FUNCTION void
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(
const Random_XorShift1024<Kokkos::Experimental::ROCm>& state) const {
#ifdef __HCC_ACCELERATOR__
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
locks_(state.state_idx_) = 0;
return;
#endif
}
#endif
namespace Impl {
template <class ViewType, class RandomPool, int loops, int rank,
@ -2043,7 +1718,7 @@ void fill_random(ViewType a, RandomPool g,
typename ViewType::const_value_type range) {
int64_t LDA = a.extent(0);
if (LDA > 0)
parallel_for((LDA + 127) / 128,
parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
Impl::fill_random_functor_range<ViewType, RandomPool, 128,
ViewType::Rank, IndexType>(
a, g, range));
@ -2055,7 +1730,7 @@ void fill_random(ViewType a, RandomPool g,
typename ViewType::const_value_type end) {
int64_t LDA = a.extent(0);
if (LDA > 0)
parallel_for((LDA + 127) / 128,
parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
ViewType::Rank, IndexType>(
a, g, begin, end));