Update Kokkos library in LAMMPS to v3.1
This commit is contained in:
@ -537,6 +537,145 @@ struct rand<Generator, Kokkos::complex<double> > {
|
||||
}
|
||||
};
|
||||
|
||||
template <class DeviceType>
|
||||
class Random_XorShift1024_Pool;
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template <bool UseCArrayState>
|
||||
struct Random_XorShift1024_State {
|
||||
uint64_t state_[16];
|
||||
KOKKOS_DEFAULTED_FUNCTION
|
||||
Random_XorShift1024_State() = default;
|
||||
|
||||
template <class StateViewType>
|
||||
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
|
||||
int state_idx) {
|
||||
for (int i = 0; i < 16; i++) state_[i] = v(state_idx, i);
|
||||
}
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
uint64_t operator[](const int i) const { return state_[i]; }
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
uint64_t& operator[](const int i) { return state_[i]; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct Random_XorShift1024_State<false> {
|
||||
uint64_t* state_;
|
||||
const int stride_;
|
||||
KOKKOS_FUNCTION
|
||||
Random_XorShift1024_State() : state_(nullptr), stride_(1){};
|
||||
|
||||
template <class StateViewType>
|
||||
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
|
||||
int state_idx)
|
||||
: state_(&v(state_idx, 0)), stride_(v.stride_1()) {}
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
uint64_t operator[](const int i) const { return state_[i * stride_]; }
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
uint64_t& operator[](const int i) { return state_[i * stride_]; }
|
||||
};
|
||||
|
||||
template <class ExecutionSpace>
|
||||
struct Random_XorShift1024_UseCArrayState : std::true_type {};
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
template <>
|
||||
struct Random_XorShift1024_UseCArrayState<Kokkos::Cuda> : std::false_type {};
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
template <>
|
||||
struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::HIP>
|
||||
: std::false_type {};
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
||||
template <>
|
||||
struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
|
||||
: std::false_type {};
|
||||
#endif
|
||||
|
||||
template <class ExecutionSpace>
|
||||
struct Random_UniqueIndex {
|
||||
using locks_view_type = View<int*, ExecutionSpace>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type) {
|
||||
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
const int i = ExecutionSpace::hardware_thread_id();
|
||||
#else
|
||||
const int i = ExecutionSpace::impl_hardware_thread_id();
|
||||
#endif
|
||||
return i;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
template <>
|
||||
struct Random_UniqueIndex<Kokkos::Cuda> {
|
||||
using locks_view_type = View<int*, Kokkos::Cuda>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type& locks_) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
const int i_offset =
|
||||
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
|
||||
int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
|
||||
blockDim.x * blockDim.y * blockDim.z +
|
||||
i_offset) %
|
||||
locks_.extent(0);
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
i += blockDim.x * blockDim.y * blockDim.z;
|
||||
if (i >= static_cast<int>(locks_.extent(0))) {
|
||||
i = i_offset;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
#else
|
||||
(void)locks_;
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_HIP
|
||||
template <>
|
||||
struct Random_UniqueIndex<Kokkos::Experimental::HIP> {
|
||||
using locks_view_type = View<int*, Kokkos::Experimental::HIP>;
|
||||
KOKKOS_FUNCTION
|
||||
static int get_state_idx(const locks_view_type& locks_) {
|
||||
#ifdef __HIP_DEVICE_COMPILE__
|
||||
const int i_offset =
|
||||
(hipThreadIdx_x * hipBlockDim_y + hipThreadIdx_y) * hipBlockDim_z +
|
||||
hipThreadIdx_z;
|
||||
int i = (((hipBlockIdx_x * hipGridDim_y + hipBlockIdx_y) * hipGridDim_z +
|
||||
hipBlockIdx_z) *
|
||||
hipBlockDim_x * hipBlockDim_y * hipBlockDim_z +
|
||||
i_offset) %
|
||||
locks_.extent(0);
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
i += hipBlockDim_x * hipBlockDim_y * hipBlockDim_z;
|
||||
if (i >= static_cast<int>(locks_.extent(0))) {
|
||||
i = i_offset;
|
||||
}
|
||||
}
|
||||
return i;
|
||||
#else
|
||||
(void)locks_;
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
template <class DeviceType>
|
||||
class Random_XorShift64_Pool;
|
||||
|
||||
@ -550,10 +689,10 @@ class Random_XorShift64 {
|
||||
public:
|
||||
typedef DeviceType device_type;
|
||||
|
||||
enum { MAX_URAND = 0xffffffffU };
|
||||
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
|
||||
enum { MAX_RAND = static_cast<int>(0xffffffff / 2) };
|
||||
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffLL / 2 - 1) };
|
||||
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
|
||||
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
|
||||
constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max();
|
||||
constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max();
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64(uint64_t state, int state_idx = 0)
|
||||
@ -637,10 +776,12 @@ class Random_XorShift64 {
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand() { return 1.0f * urand64() / MAX_URAND64; }
|
||||
float frand() { return urand64() / static_cast<float>(MAX_URAND64); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
|
||||
float frand(const float& range) {
|
||||
return range * urand64() / static_cast<float>(MAX_URAND64);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& start, const float& end) {
|
||||
@ -648,10 +789,12 @@ class Random_XorShift64 {
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand() { return 1.0 * urand64() / MAX_URAND64; }
|
||||
double drand() { return urand64() / static_cast<double>(MAX_URAND64); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
|
||||
double drand(const double& range) {
|
||||
return range * urand64() / static_cast<double>(MAX_URAND64);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& start, const double& end) {
|
||||
@ -662,6 +805,11 @@ class Random_XorShift64 {
|
||||
// number
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal() {
|
||||
#ifndef __HIP_DEVICE_COMPILE__ // FIXME_HIP
|
||||
using std::sqrt;
|
||||
#else
|
||||
using ::sqrt;
|
||||
#endif
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while (S >= 1.0) {
|
||||
@ -669,7 +817,7 @@ class Random_XorShift64 {
|
||||
const double V = 2.0 * drand() - 1.0;
|
||||
S = U * U + V * V;
|
||||
}
|
||||
return U * std::sqrt(-2.0 * log(S) / S);
|
||||
return U * sqrt(-2.0 * log(S) / S);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -681,9 +829,10 @@ class Random_XorShift64 {
|
||||
template <class DeviceType = Kokkos::DefaultExecutionSpace>
|
||||
class Random_XorShift64_Pool {
|
||||
private:
|
||||
typedef View<int*, DeviceType> lock_type;
|
||||
using execution_space = typename DeviceType::execution_space;
|
||||
typedef View<int*, execution_space> locks_type;
|
||||
typedef View<uint64_t*, DeviceType> state_data_type;
|
||||
lock_type locks_;
|
||||
locks_type locks_;
|
||||
state_data_type state_;
|
||||
int num_states_;
|
||||
|
||||
@ -695,11 +844,8 @@ class Random_XorShift64_Pool {
|
||||
Random_XorShift64_Pool() { num_states_ = 0; }
|
||||
Random_XorShift64_Pool(uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
init(seed, DeviceType::max_hardware_threads());
|
||||
#else
|
||||
init(seed, DeviceType::impl_max_hardware_threads());
|
||||
#endif
|
||||
|
||||
init(seed, execution_space().concurrency());
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -719,11 +865,11 @@ class Random_XorShift64_Pool {
|
||||
|
||||
num_states_ = num_states;
|
||||
|
||||
locks_ = lock_type("Kokkos::Random_XorShift64::locks", num_states_);
|
||||
locks_ = locks_type("Kokkos::Random_XorShift64::locks", num_states_);
|
||||
state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_);
|
||||
|
||||
typename state_data_type::HostMirror h_state = create_mirror_view(state_);
|
||||
typename lock_type::HostMirror h_lock = create_mirror_view(locks_);
|
||||
typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
|
||||
|
||||
// Execute on the HostMirror's default execution space.
|
||||
Random_XorShift64<typename state_data_type::HostMirror::execution_space>
|
||||
@ -746,13 +892,8 @@ class Random_XorShift64_Pool {
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64<DeviceType> get_state() const {
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
const int i = DeviceType::hardware_thread_id();
|
||||
;
|
||||
#else
|
||||
const int i = DeviceType::impl_hardware_thread_id();
|
||||
;
|
||||
#endif
|
||||
const int i =
|
||||
Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
|
||||
return Random_XorShift64<DeviceType>(state_(i), i);
|
||||
}
|
||||
|
||||
@ -765,35 +906,35 @@ class Random_XorShift64_Pool {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void free_state(const Random_XorShift64<DeviceType>& state) const {
|
||||
state_(state.state_idx_) = state.state_;
|
||||
locks_(state.state_idx_) = 0;
|
||||
}
|
||||
};
|
||||
|
||||
template <class DeviceType>
|
||||
class Random_XorShift1024_Pool;
|
||||
|
||||
template <class DeviceType>
|
||||
class Random_XorShift1024 {
|
||||
using execution_space = typename DeviceType::execution_space;
|
||||
|
||||
private:
|
||||
int p_;
|
||||
const int state_idx_;
|
||||
uint64_t state_[16];
|
||||
Impl::Random_XorShift1024_State<
|
||||
Impl::Random_XorShift1024_UseCArrayState<execution_space>::value>
|
||||
state_;
|
||||
friend class Random_XorShift1024_Pool<DeviceType>;
|
||||
|
||||
public:
|
||||
typedef Random_XorShift1024_Pool<DeviceType> pool_type;
|
||||
typedef DeviceType device_type;
|
||||
|
||||
enum { MAX_URAND = 0xffffffffU };
|
||||
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
|
||||
enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
|
||||
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
|
||||
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
|
||||
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
|
||||
constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max();
|
||||
constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max();
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
|
||||
int state_idx = 0)
|
||||
: p_(p), state_idx_(state_idx) {
|
||||
for (int i = 0; i < 16; i++) state_[i] = state(state_idx, i);
|
||||
}
|
||||
: p_(p), state_idx_(state_idx), state_(state, state_idx) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand() {
|
||||
@ -876,10 +1017,12 @@ class Random_XorShift1024 {
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand() { return 1.0f * urand64() / MAX_URAND64; }
|
||||
float frand() { return urand64() / static_cast<float>(MAX_URAND64); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
|
||||
float frand(const float& range) {
|
||||
return range * urand64() / static_cast<float>(MAX_URAND64);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& start, const float& end) {
|
||||
@ -887,10 +1030,12 @@ class Random_XorShift1024 {
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand() { return 1.0 * urand64() / MAX_URAND64; }
|
||||
double drand() { return urand64() / static_cast<double>(MAX_URAND64); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
|
||||
double drand(const double& range) {
|
||||
return range * urand64() / static_cast<double>(MAX_URAND64);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& start, const double& end) {
|
||||
@ -901,6 +1046,11 @@ class Random_XorShift1024 {
|
||||
// number
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal() {
|
||||
#ifndef KOKKOS_ENABLE_HIP // FIXME_HIP
|
||||
using std::sqrt;
|
||||
#else
|
||||
using ::sqrt;
|
||||
#endif
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while (S >= 1.0) {
|
||||
@ -908,7 +1058,7 @@ class Random_XorShift1024 {
|
||||
const double V = 2.0 * drand() - 1.0;
|
||||
S = U * U + V * V;
|
||||
}
|
||||
return U * std::sqrt(-2.0 * log(S) / S);
|
||||
return U * sqrt(-2.0 * log(S) / S);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -920,10 +1070,12 @@ class Random_XorShift1024 {
|
||||
template <class DeviceType = Kokkos::DefaultExecutionSpace>
|
||||
class Random_XorShift1024_Pool {
|
||||
private:
|
||||
using execution_space = typename DeviceType::execution_space;
|
||||
typedef View<int*, execution_space> locks_type;
|
||||
typedef View<int*, DeviceType> int_view_type;
|
||||
typedef View<uint64_t * [16], DeviceType> state_data_type;
|
||||
|
||||
int_view_type locks_;
|
||||
locks_type locks_;
|
||||
state_data_type state_;
|
||||
int_view_type p_;
|
||||
int num_states_;
|
||||
@ -939,11 +1091,8 @@ class Random_XorShift1024_Pool {
|
||||
|
||||
inline Random_XorShift1024_Pool(uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
init(seed, DeviceType::max_hardware_threads());
|
||||
#else
|
||||
init(seed, DeviceType::impl_max_hardware_threads());
|
||||
#endif
|
||||
|
||||
init(seed, execution_space().concurrency());
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -965,12 +1114,12 @@ class Random_XorShift1024_Pool {
|
||||
inline void init(uint64_t seed, int num_states) {
|
||||
if (seed == 0) seed = uint64_t(1318319);
|
||||
num_states_ = num_states;
|
||||
locks_ = int_view_type("Kokkos::Random_XorShift1024::locks", num_states_);
|
||||
locks_ = locks_type("Kokkos::Random_XorShift1024::locks", num_states_);
|
||||
state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
|
||||
p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_);
|
||||
|
||||
typename state_data_type::HostMirror h_state = create_mirror_view(state_);
|
||||
typename int_view_type::HostMirror h_lock = create_mirror_view(locks_);
|
||||
typename locks_type::HostMirror h_lock = create_mirror_view(locks_);
|
||||
typename int_view_type::HostMirror h_p = create_mirror_view(p_);
|
||||
|
||||
// Execute on the HostMirror's default execution space.
|
||||
@ -997,11 +1146,8 @@ class Random_XorShift1024_Pool {
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024<DeviceType> get_state() const {
|
||||
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
|
||||
const int i = DeviceType::hardware_thread_id();
|
||||
#else
|
||||
const int i = DeviceType::impl_hardware_thread_id();
|
||||
#endif
|
||||
const int i =
|
||||
Impl::Random_UniqueIndex<execution_space>::get_state_idx(locks_);
|
||||
return Random_XorShift1024<DeviceType>(state_, p_(i), i);
|
||||
};
|
||||
|
||||
@ -1014,482 +1160,11 @@ class Random_XorShift1024_Pool {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void free_state(const Random_XorShift1024<DeviceType>& state) const {
|
||||
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
|
||||
p_(state.state_idx_) = state.p_;
|
||||
p_(state.state_idx_) = state.p_;
|
||||
locks_(state.state_idx_) = 0;
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
|
||||
|
||||
template <>
|
||||
class Random_XorShift1024<Kokkos::Cuda> {
|
||||
private:
|
||||
int p_;
|
||||
const int state_idx_;
|
||||
uint64_t* state_;
|
||||
const int stride_;
|
||||
friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
|
||||
|
||||
public:
|
||||
typedef Kokkos::Cuda device_type;
|
||||
typedef Random_XorShift1024_Pool<device_type> pool_type;
|
||||
|
||||
enum { MAX_URAND = 0xffffffffU };
|
||||
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
|
||||
enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
|
||||
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
|
||||
int state_idx = 0)
|
||||
: p_(p),
|
||||
state_idx_(state_idx),
|
||||
state_(&state(state_idx, 0)),
|
||||
stride_(state.stride_1()) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand() {
|
||||
uint64_t state_0 = state_[p_ * stride_];
|
||||
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
uint64_t tmp =
|
||||
(state_[p_ * stride_] = state_0 ^ state_1) * 1181783497276652981ULL;
|
||||
tmp = tmp >> 16;
|
||||
return static_cast<uint32_t>(tmp & MAX_URAND);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64() {
|
||||
uint64_t state_0 = state_[p_ * stride_];
|
||||
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
return ((state_[p_ * stride_] = state_0 ^ state_1) *
|
||||
1181783497276652981LL) -
|
||||
1;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& range) {
|
||||
const uint32_t max_val = (MAX_URAND / range) * range;
|
||||
uint32_t tmp = urand();
|
||||
while (tmp >= max_val) urand();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& start, const uint32_t& end) {
|
||||
return urand(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& range) {
|
||||
const uint64_t max_val = (MAX_URAND64 / range) * range;
|
||||
uint64_t tmp = urand64();
|
||||
while (tmp >= max_val) urand64();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
|
||||
return urand64(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand() { return static_cast<int>(urand() / 2); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& range) {
|
||||
const int max_val = (MAX_RAND / range) * range;
|
||||
int tmp = rand();
|
||||
while (tmp >= max_val) rand();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& start, const int& end) {
|
||||
return rand(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& range) {
|
||||
const int64_t max_val = (MAX_RAND64 / range) * range;
|
||||
int64_t tmp = rand64();
|
||||
while (tmp >= max_val) rand64();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& start, const int64_t& end) {
|
||||
return rand64(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand() { return 1.0f * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& start, const float& end) {
|
||||
return frand(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand() { return 1.0 * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& start, const double& end) {
|
||||
return frand(end - start) + start;
|
||||
}
|
||||
|
||||
// Marsaglia polar method for drawing a standard normal distributed random
|
||||
// number
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal() {
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while (S >= 1.0) {
|
||||
U = 2.0 * drand() - 1.0;
|
||||
const double V = 2.0 * drand() - 1.0;
|
||||
S = U * U + V * V;
|
||||
}
|
||||
return U * std::sqrt(-2.0 * log(S) / S);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal(const double& mean, const double& std_dev = 1.0) {
|
||||
return mean + normal() * std_dev;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
inline Random_XorShift64_Pool<Kokkos::Cuda>::Random_XorShift64_Pool(
|
||||
uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
init(seed, 4 * 32768);
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION Random_XorShift64<Kokkos::Cuda>
|
||||
Random_XorShift64_Pool<Kokkos::Cuda>::get_state() const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
const int i_offset =
|
||||
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
|
||||
int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
|
||||
blockDim.x * blockDim.y * blockDim.z +
|
||||
i_offset) %
|
||||
num_states_;
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
i += blockDim.x * blockDim.y * blockDim.z;
|
||||
if (i >= num_states_) {
|
||||
i = i_offset;
|
||||
}
|
||||
}
|
||||
|
||||
return Random_XorShift64<Kokkos::Cuda>(state_(i), i);
|
||||
#else
|
||||
return Random_XorShift64<Kokkos::Cuda>(state_(0), 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(
|
||||
const Random_XorShift64<Kokkos::Cuda>& state) const {
|
||||
state_(state.state_idx_) = state.state_;
|
||||
#ifdef __CUDA_ARCH__
|
||||
locks_(state.state_idx_) = 0;
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Random_XorShift1024_Pool<Kokkos::Cuda>::Random_XorShift1024_Pool(
|
||||
uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
init(seed, 4 * 32768);
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION Random_XorShift1024<Kokkos::Cuda>
|
||||
Random_XorShift1024_Pool<Kokkos::Cuda>::get_state() const {
|
||||
#ifdef __CUDA_ARCH__
|
||||
const int i_offset =
|
||||
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
|
||||
int i = (((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
|
||||
blockDim.x * blockDim.y * blockDim.z +
|
||||
i_offset) %
|
||||
num_states_;
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
i += blockDim.x * blockDim.y * blockDim.z;
|
||||
if (i >= num_states_) {
|
||||
i = i_offset;
|
||||
}
|
||||
}
|
||||
|
||||
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
|
||||
#else
|
||||
return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(
|
||||
const Random_XorShift1024<Kokkos::Cuda>& state) const {
|
||||
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
|
||||
#ifdef __CUDA_ARCH__
|
||||
locks_(state.state_idx_) = 0;
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(KOKKOS_ENABLE_ROCM)
|
||||
|
||||
template <>
|
||||
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
|
||||
private:
|
||||
int p_;
|
||||
const int state_idx_;
|
||||
uint64_t* state_;
|
||||
const int stride_;
|
||||
friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
|
||||
|
||||
public:
|
||||
typedef Kokkos::Experimental::ROCm device_type;
|
||||
typedef Random_XorShift1024_Pool<device_type> pool_type;
|
||||
|
||||
enum { MAX_URAND = 0xffffffffU };
|
||||
enum { MAX_URAND64 = 0xffffffffffffffffULL - 1 };
|
||||
enum { MAX_RAND = static_cast<int>(0xffffffffU / 2) };
|
||||
enum { MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL / 2 - 1) };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
|
||||
int state_idx = 0)
|
||||
: p_(p),
|
||||
state_idx_(state_idx),
|
||||
state_(&state(state_idx, 0)),
|
||||
stride_(state.stride_1()) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand() {
|
||||
uint64_t state_0 = state_[p_ * stride_];
|
||||
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
uint64_t tmp =
|
||||
(state_[p_ * stride_] = state_0 ^ state_1) * 1181783497276652981ULL;
|
||||
tmp = tmp >> 16;
|
||||
return static_cast<uint32_t>(tmp & MAX_URAND);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64() {
|
||||
uint64_t state_0 = state_[p_ * stride_];
|
||||
uint64_t state_1 = state_[(p_ = (p_ + 1) & 15) * stride_];
|
||||
state_1 ^= state_1 << 31;
|
||||
state_1 ^= state_1 >> 11;
|
||||
state_0 ^= state_0 >> 30;
|
||||
return ((state_[p_ * stride_] = state_0 ^ state_1) *
|
||||
1181783497276652981LL) -
|
||||
1;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& range) {
|
||||
const uint32_t max_val = (MAX_URAND / range) * range;
|
||||
uint32_t tmp = urand();
|
||||
while (tmp >= max_val) urand();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand(const uint32_t& start, const uint32_t& end) {
|
||||
return urand(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& range) {
|
||||
const uint64_t max_val = (MAX_URAND64 / range) * range;
|
||||
uint64_t tmp = urand64();
|
||||
while (tmp >= max_val) urand64();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
|
||||
return urand64(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand() { return static_cast<int>(urand() / 2); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& range) {
|
||||
const int max_val = (MAX_RAND / range) * range;
|
||||
int tmp = rand();
|
||||
while (tmp >= max_val) rand();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int rand(const int& start, const int& end) {
|
||||
return rand(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& range) {
|
||||
const int64_t max_val = (MAX_RAND64 / range) * range;
|
||||
int64_t tmp = rand64();
|
||||
while (tmp >= max_val) rand64();
|
||||
return tmp % range;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t rand64(const int64_t& start, const int64_t& end) {
|
||||
return rand64(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand() { return 1.0f * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& range) { return range * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float frand(const float& start, const float& end) {
|
||||
return frand(end - start) + start;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand() { return 1.0 * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& range) { return range * urand64() / MAX_URAND64; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double drand(const double& start, const double& end) {
|
||||
return frand(end - start) + start;
|
||||
}
|
||||
|
||||
// Marsaglia polar method for drawing a standard normal distributed random
|
||||
// number
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal() {
|
||||
double S = 2.0;
|
||||
double U;
|
||||
while (S >= 1.0) {
|
||||
U = 2.0 * drand() - 1.0;
|
||||
const double V = 2.0 * drand() - 1.0;
|
||||
S = U * U + V * V;
|
||||
}
|
||||
return U * std::sqrt(-2.0 * log(S) / S);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double normal(const double& mean, const double& std_dev = 1.0) {
|
||||
return mean + normal() * std_dev;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
inline Random_XorShift64_Pool<
|
||||
Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
init(seed, 4 * 32768);
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION Random_XorShift64<Kokkos::Experimental::ROCm>
|
||||
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
const int i_offset =
|
||||
(threadIdx_x * blockDim_y + threadIdx_y) * blockDim_z + threadIdx_z;
|
||||
int i = (((blockIdx_x * gridDim_y + blockIdx_y) * gridDim_z + blockIdx_z) *
|
||||
blockDim_x * blockDim_y * blockDim_z +
|
||||
i_offset) %
|
||||
num_states_;
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
i += blockDim_x * blockDim_y * blockDim_z;
|
||||
if (i >= num_states_) {
|
||||
i = i_offset;
|
||||
}
|
||||
}
|
||||
|
||||
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i), i);
|
||||
#else
|
||||
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0), 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION void
|
||||
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(
|
||||
const Random_XorShift64<Kokkos::Experimental::ROCm>& state) const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
state_(state.state_idx_) = state.state_;
|
||||
locks_(state.state_idx_) = 0;
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
inline Random_XorShift1024_Pool<
|
||||
Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
|
||||
num_states_ = 0;
|
||||
init(seed, 4 * 32768);
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION Random_XorShift1024<Kokkos::Experimental::ROCm>
|
||||
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
const int i_offset =
|
||||
(threadIdx_x * blockDim_y + threadIdx_y) * blockDim_z + threadIdx_z;
|
||||
int i = (((blockIdx_x * gridDim_y + blockIdx_y) * gridDim_z + blockIdx_z) *
|
||||
blockDim_x * blockDim_y * blockDim_z +
|
||||
i_offset) %
|
||||
num_states_;
|
||||
while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) {
|
||||
i += blockDim_x * blockDim_y * blockDim_z;
|
||||
if (i >= num_states_) {
|
||||
i = i_offset;
|
||||
}
|
||||
}
|
||||
|
||||
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
|
||||
#else
|
||||
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
KOKKOS_INLINE_FUNCTION void
|
||||
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(
|
||||
const Random_XorShift1024<Kokkos::Experimental::ROCm>& state) const {
|
||||
#ifdef __HCC_ACCELERATOR__
|
||||
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
|
||||
locks_(state.state_idx_) = 0;
|
||||
return;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template <class ViewType, class RandomPool, int loops, int rank,
|
||||
@ -2043,7 +1718,7 @@ void fill_random(ViewType a, RandomPool g,
|
||||
typename ViewType::const_value_type range) {
|
||||
int64_t LDA = a.extent(0);
|
||||
if (LDA > 0)
|
||||
parallel_for((LDA + 127) / 128,
|
||||
parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
|
||||
Impl::fill_random_functor_range<ViewType, RandomPool, 128,
|
||||
ViewType::Rank, IndexType>(
|
||||
a, g, range));
|
||||
@ -2055,7 +1730,7 @@ void fill_random(ViewType a, RandomPool g,
|
||||
typename ViewType::const_value_type end) {
|
||||
int64_t LDA = a.extent(0);
|
||||
if (LDA > 0)
|
||||
parallel_for((LDA + 127) / 128,
|
||||
parallel_for("Kokkos::fill_random", (LDA + 127) / 128,
|
||||
Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
|
||||
ViewType::Rank, IndexType>(
|
||||
a, g, begin, end));
|
||||
|
||||
Reference in New Issue
Block a user