Files
lammps/lib/kokkos/algorithms/src/Kokkos_Random.hpp
2024-12-13 09:23:03 -07:00

1571 lines
55 KiB
C++

//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER
#ifndef KOKKOS_RANDOM_HPP
#define KOKKOS_RANDOM_HPP
#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
#define KOKKOS_IMPL_PUBLIC_INCLUDE
#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
#endif
#include <Kokkos_Core.hpp>
#include <Kokkos_Complex.hpp>
#include <cstdio>
#include <cstdlib>
#include <cmath>
/// \file Kokkos_Random.hpp
/// \brief Pseudorandom number generators
///
/// These generators are based on Vigna, Sebastiano (2014). "An
/// experimental exploration of Marsaglia's xorshift generators,
/// scrambled." See: http://arxiv.org/abs/1402.6246
namespace Kokkos {
// clang-format off
/*Template functions to get equidistributed random numbers from a generator for a specific Scalar type
template<class Generator,Scalar>
struct rand{
//Max value returned by draw(Generator& gen)
KOKKOS_INLINE_FUNCTION
static Scalar max();
//Returns a value between zero and max()
KOKKOS_INLINE_FUNCTION
static Scalar draw(Generator& gen);
//Returns a value between zero and range()
//Note: for floating point values range can be larger than max()
KOKKOS_INLINE_FUNCTION
static Scalar draw(Generator& gen, const Scalar& range){}
//Return value between start and end
KOKKOS_INLINE_FUNCTION
static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end);
};
The Random number generators themselves have two components a state-pool and the actual generator
A state-pool manages a number of generators, so that each active thread is able to grep its own.
This allows the generation of random numbers which are independent between threads. Note that
in contrast to CuRand none of the functions of the pool (or the generator) are collectives,
i.e. all functions can be called inside conditionals.
template<class Device>
class Pool {
public:
//The Kokkos device type
using device_type = Device;
//The actual generator type
using generator_type = Generator<Device>;
//Default constructor: does not initialize a pool
Pool();
//Initializing constructor: calls init(seed,Device_Specific_Number);
Pool(unsigned int seed);
//Initialize Pool with seed as a starting seed with a pool_size of num_states
//The Random_XorShift64 generator is used in serial to initialize all states,
//thus the initialization process is platform independent and deterministic.
void init(unsigned int seed, int num_states);
//Get a generator. This will lock one of the states, guaranteeing that each thread
//will have its private generator. Note: on Cuda getting a state involves atomics,
//and is thus not deterministic!
generator_type get_state();
//Give a state back to the pool. This unlocks the state, and writes the modified
//state of the generator back to the pool.
void free_state(generator_type gen);
}
template<class Device>
class Generator {
public:
//The Kokkos device type
using device_type = DeviceType;
//Max return values of respective [X]rand[S]() functions
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
//Init with a state and the idx with respect to pool. Note: in serial the
//Generator can be used by just giving it the necessary state arguments
KOKKOS_INLINE_FUNCTION
Generator (STATE_ARGUMENTS, int state_idx = 0);
//Draw a equidistributed uint32_t in the range [0,MAX_URAND)
KOKKOS_INLINE_FUNCTION
uint32_t urand();
//Draw a equidistributed uint64_t in the range [0,MAX_URAND64)
KOKKOS_INLINE_FUNCTION
uint64_t urand64();
//Draw a equidistributed uint32_t in the range [0,range)
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range);
//Draw a equidistributed uint32_t in the range [start,end)
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end );
//Draw a equidistributed uint64_t in the range [0,range)
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range);
//Draw a equidistributed uint64_t in the range [start,end)
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end );
//Draw a equidistributed int in the range [0,MAX_RAND)
KOKKOS_INLINE_FUNCTION
int rand();
//Draw a equidistributed int in the range [0,range)
KOKKOS_INLINE_FUNCTION
int rand(const int& range);
//Draw a equidistributed int in the range [start,end)
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end );
//Draw a equidistributed int64_t in the range [0,MAX_RAND64)
KOKKOS_INLINE_FUNCTION
int64_t rand64();
//Draw a equidistributed int64_t in the range [0,range)
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range);
//Draw a equidistributed int64_t in the range [start,end)
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end );
//Draw a equidistributed float in the range [0,1.0)
KOKKOS_INLINE_FUNCTION
float frand();
//Draw a equidistributed float in the range [0,range)
KOKKOS_INLINE_FUNCTION
float frand(const float& range);
//Draw a equidistributed float in the range [start,end)
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end );
//Draw a equidistributed double in the range [0,1.0)
KOKKOS_INLINE_FUNCTION
double drand();
//Draw a equidistributed double in the range [0,range)
KOKKOS_INLINE_FUNCTION
double drand(const double& range);
//Draw a equidistributed double in the range [start,end)
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end );
//Draw a standard normal distributed double
KOKKOS_INLINE_FUNCTION
double normal() ;
//Draw a normal distributed double with given mean and standard deviation
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev=1.0);
}
//Additional Functions:
//Fills view with random numbers in the range [0,range)
template<class ViewType, class PoolType>
void fill_random(ViewType view, PoolType pool, ViewType::value_type range);
//Fills view with random numbers in the range [start,end)
template<class ViewType, class PoolType>
void fill_random(ViewType view, PoolType pool,
ViewType::value_type start, ViewType::value_type end);
*/
// clang-format on
template <class Generator, class Scalar>
struct rand;
template <class Generator>
struct rand<Generator, char> {
KOKKOS_INLINE_FUNCTION
static short max() { return 127; }
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen) {
return short((gen.rand() & 0xff + 256) % 256);
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const char& range) {
return char(gen.rand(range));
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const char& start, const char& end) {
return char(gen.rand(start, end));
}
};
template <class Generator>
struct rand<Generator, short> {
KOKKOS_INLINE_FUNCTION
static short max() { return 32767; }
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen) {
return short((gen.rand() & 0xffff + 65536) % 32768);
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const short& range) {
return short(gen.rand(range));
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const short& start, const short& end) {
return short(gen.rand(start, end));
}
};
template <class Generator>
struct rand<Generator, int> {
KOKKOS_INLINE_FUNCTION
static int max() { return Generator::MAX_RAND; }
KOKKOS_INLINE_FUNCTION
static int draw(Generator& gen) { return gen.rand(); }
KOKKOS_INLINE_FUNCTION
static int draw(Generator& gen, const int& range) { return gen.rand(range); }
KOKKOS_INLINE_FUNCTION
static int draw(Generator& gen, const int& start, const int& end) {
return gen.rand(start, end);
}
};
template <class Generator>
struct rand<Generator, unsigned int> {
KOKKOS_INLINE_FUNCTION
static unsigned int max() { return Generator::MAX_URAND; }
KOKKOS_INLINE_FUNCTION
static unsigned int draw(Generator& gen) { return gen.urand(); }
KOKKOS_INLINE_FUNCTION
static unsigned int draw(Generator& gen, const unsigned int& range) {
return gen.urand(range);
}
KOKKOS_INLINE_FUNCTION
static unsigned int draw(Generator& gen, const unsigned int& start,
const unsigned int& end) {
return gen.urand(start, end);
}
};
template <class Generator>
struct rand<Generator, long> {
KOKKOS_INLINE_FUNCTION
static long max() {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4 ? static_cast<long>(Generator::MAX_RAND)
: static_cast<long>(Generator::MAX_RAND64);
}
KOKKOS_INLINE_FUNCTION
static long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4 ? static_cast<long>(gen.rand())
: static_cast<long>(gen.rand64());
}
KOKKOS_INLINE_FUNCTION
static long draw(Generator& gen, const long& range) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4
? static_cast<long>(gen.rand(static_cast<int>(range)))
: static_cast<long>(gen.rand64(range));
}
KOKKOS_INLINE_FUNCTION
static long draw(Generator& gen, const long& start, const long& end) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4
? static_cast<long>(
gen.rand(static_cast<int>(start), static_cast<int>(end)))
: static_cast<long>(gen.rand64(start, end));
}
};
template <class Generator>
struct rand<Generator, unsigned long> {
KOKKOS_INLINE_FUNCTION
static unsigned long max() {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast<unsigned long>(Generator::MAX_URAND)
: static_cast<unsigned long>(Generator::MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
static unsigned long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast<unsigned long>(gen.urand())
: static_cast<unsigned long>(gen.urand64());
}
KOKKOS_INLINE_FUNCTION
static unsigned long draw(Generator& gen, const unsigned long& range) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast<unsigned long>(
gen.urand(static_cast<unsigned int>(range)))
: static_cast<unsigned long>(gen.urand64(range));
}
KOKKOS_INLINE_FUNCTION
static unsigned long draw(Generator& gen, const unsigned long& start,
const unsigned long& end) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast<unsigned long>(
gen.urand(static_cast<unsigned int>(start),
static_cast<unsigned int>(end)))
: static_cast<unsigned long>(gen.urand64(start, end));
}
};
// NOTE (mfh 26 oct 2014) This is a partial specialization for long
// long, a C99 / C++11 signed type which is guaranteed to be at
// least 64 bits. Do NOT write a partial specialization for
// int64_t!!! This is just an alias! It could be either long or
// long long. We don't know which a priori, and I've seen both.
// The types long and long long are guaranteed to differ, so it's
// always safe to specialize for both.
template <class Generator>
struct rand<Generator, long long> {
KOKKOS_INLINE_FUNCTION
static long long max() {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return Generator::MAX_RAND64;
}
KOKKOS_INLINE_FUNCTION
static long long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.rand64();
}
KOKKOS_INLINE_FUNCTION
static long long draw(Generator& gen, const long long& range) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.rand64(range);
}
KOKKOS_INLINE_FUNCTION
static long long draw(Generator& gen, const long long& start,
const long long& end) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.rand64(start, end);
}
};
// NOTE (mfh 26 oct 2014) This is a partial specialization for
// unsigned long long, a C99 / C++11 unsigned type which is
// guaranteed to be at least 64 bits. Do NOT write a partial
// specialization for uint64_t!!! This is just an alias! It could
// be either unsigned long or unsigned long long. We don't know
// which a priori, and I've seen both. The types unsigned long and
// unsigned long long are guaranteed to differ, so it's always safe
// to specialize for both.
template <class Generator>
struct rand<Generator, unsigned long long> {
KOKKOS_INLINE_FUNCTION
static unsigned long long max() {
// FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64
// bits.
return Generator::MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
static unsigned long long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64
// bits.
return gen.urand64();
}
KOKKOS_INLINE_FUNCTION
static unsigned long long draw(Generator& gen,
const unsigned long long& range) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.urand64(range);
}
KOKKOS_INLINE_FUNCTION
static unsigned long long draw(Generator& gen,
const unsigned long long& start,
const unsigned long long& end) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.urand64(start, end);
}
};
#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
template <class Generator>
struct rand<Generator, Kokkos::Experimental::half_t> {
using half = Kokkos::Experimental::half_t;
KOKKOS_INLINE_FUNCTION
static half max() { return half(1.0); }
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen) { return half(gen.frand()); }
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen, const half& range) {
return half(gen.frand(float(range)));
}
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen, const half& start, const half& end) {
return half(gen.frand(float(start), float(end)));
}
};
#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
template <class Generator>
struct rand<Generator, Kokkos::Experimental::bhalf_t> {
using bhalf = Kokkos::Experimental::bhalf_t;
KOKKOS_INLINE_FUNCTION
static bhalf max() { return bhalf(1.0); }
KOKKOS_INLINE_FUNCTION
static bhalf draw(Generator& gen) { return bhalf(gen.frand()); }
KOKKOS_INLINE_FUNCTION
static bhalf draw(Generator& gen, const bhalf& range) {
return bhalf(gen.frand(float(range)));
}
KOKKOS_INLINE_FUNCTION
static bhalf draw(Generator& gen, const bhalf& start, const bhalf& end) {
return bhalf(gen.frand(float(start), float(end)));
}
};
#endif // defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
template <class Generator>
struct rand<Generator, float> {
KOKKOS_INLINE_FUNCTION
static float max() { return 1.0f; }
KOKKOS_INLINE_FUNCTION
static float draw(Generator& gen) { return gen.frand(); }
KOKKOS_INLINE_FUNCTION
static float draw(Generator& gen, const float& range) {
return gen.frand(range);
}
KOKKOS_INLINE_FUNCTION
static float draw(Generator& gen, const float& start, const float& end) {
return gen.frand(start, end);
}
};
template <class Generator>
struct rand<Generator, double> {
KOKKOS_INLINE_FUNCTION
static double max() { return 1.0; }
KOKKOS_INLINE_FUNCTION
static double draw(Generator& gen) { return gen.drand(); }
KOKKOS_INLINE_FUNCTION
static double draw(Generator& gen, const double& range) {
return gen.drand(range);
}
KOKKOS_INLINE_FUNCTION
static double draw(Generator& gen, const double& start, const double& end) {
return gen.drand(start, end);
}
};
template <class Generator>
struct rand<Generator, Kokkos::complex<float>> {
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<float> max() {
return Kokkos::complex<float>(1.0, 1.0);
}
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<float> draw(Generator& gen) {
const float re = gen.frand();
const float im = gen.frand();
return Kokkos::complex<float>(re, im);
}
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<float> draw(Generator& gen,
const Kokkos::complex<float>& range) {
const float re = gen.frand(real(range));
const float im = gen.frand(imag(range));
return Kokkos::complex<float>(re, im);
}
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<float> draw(Generator& gen,
const Kokkos::complex<float>& start,
const Kokkos::complex<float>& end) {
const float re = gen.frand(real(start), real(end));
const float im = gen.frand(imag(start), imag(end));
return Kokkos::complex<float>(re, im);
}
};
template <class Generator>
struct rand<Generator, Kokkos::complex<double>> {
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<double> max() {
return Kokkos::complex<double>(1.0, 1.0);
}
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<double> draw(Generator& gen) {
const double re = gen.drand();
const double im = gen.drand();
return Kokkos::complex<double>(re, im);
}
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<double> draw(Generator& gen,
const Kokkos::complex<double>& range) {
const double re = gen.drand(real(range));
const double im = gen.drand(imag(range));
return Kokkos::complex<double>(re, im);
}
KOKKOS_INLINE_FUNCTION
static Kokkos::complex<double> draw(Generator& gen,
const Kokkos::complex<double>& start,
const Kokkos::complex<double>& end) {
const double re = gen.drand(real(start), real(end));
const double im = gen.drand(imag(start), imag(end));
return Kokkos::complex<double>(re, im);
}
};
template <class DeviceType>
class Random_XorShift1024_Pool;
namespace Impl {
template <bool UseCArrayState>
struct Random_XorShift1024_State {
uint64_t state_[16];
KOKKOS_DEFAULTED_FUNCTION
Random_XorShift1024_State() = default;
template <class StateViewType>
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
int state_idx) {
for (int i = 0; i < 16; i++) state_[i] = v(state_idx, i);
}
KOKKOS_FUNCTION
uint64_t operator[](const int i) const { return state_[i]; }
KOKKOS_FUNCTION
uint64_t& operator[](const int i) { return state_[i]; }
};
template <>
struct Random_XorShift1024_State<false> {
uint64_t* state_;
const int stride_;
KOKKOS_FUNCTION
Random_XorShift1024_State() : state_(nullptr), stride_(1){};
template <class StateViewType>
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
int state_idx)
: state_(&v(state_idx, 0)), stride_(v.stride_1()) {}
KOKKOS_FUNCTION
uint64_t operator[](const int i) const { return state_[i * stride_]; }
KOKKOS_FUNCTION
uint64_t& operator[](const int i) { return state_[i * stride_]; }
};
template <class ExecutionSpace>
struct Random_XorShift1024_UseCArrayState : std::true_type {};
#ifdef KOKKOS_ENABLE_CUDA
template <>
struct Random_XorShift1024_UseCArrayState<Kokkos::Cuda> : std::false_type {};
#endif
#ifdef KOKKOS_ENABLE_HIP
template <>
struct Random_XorShift1024_UseCArrayState<Kokkos::HIP> : std::false_type {};
#endif
#ifdef KOKKOS_ENABLE_OPENMPTARGET
template <>
struct Random_XorShift1024_UseCArrayState<Kokkos::Experimental::OpenMPTarget>
: std::false_type {};
#endif
template <class DeviceType>
struct Random_UniqueIndex {
using locks_view_type = View<int**, DeviceType>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type&) {
KOKKOS_IF_ON_HOST(
(return DeviceType::execution_space::impl_hardware_thread_id();))
KOKKOS_IF_ON_DEVICE((return 0;))
}
};
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
#if defined(KOKKOS_ENABLE_CUDA)
#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::Cuda
#elif defined(KOKKOS_ENABLE_HIP)
#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::HIP
#endif
template <class MemorySpace>
struct Random_UniqueIndex<
Kokkos::Device<KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP, MemorySpace>> {
using locks_view_type =
View<int**, Kokkos::Device<KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP,
MemorySpace>>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
KOKKOS_IF_ON_DEVICE((
const int i_offset =
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
int i =
(((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
blockDim.x * blockDim.y * blockDim.z +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
i += blockDim.x * blockDim.y * blockDim.z;
if (i >= static_cast<int>(locks_.extent(0))) {
i = i_offset;
}
}
return i;))
KOKKOS_IF_ON_HOST(((void)locks_; return 0;))
}
};
#undef KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP
#endif
#ifdef KOKKOS_ENABLE_SYCL
template <class MemorySpace>
struct Random_UniqueIndex<Kokkos::Device<Kokkos::SYCL, MemorySpace>> {
using locks_view_type =
View<int**, Kokkos::Device<Kokkos::SYCL, MemorySpace>>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
auto item = sycl::ext::oneapi::experimental::this_nd_item<3>();
std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1),
item.get_local_id(0)};
std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1),
item.get_group(0)};
std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1),
item.get_local_range(0)};
std::size_t gridDim[3] = {
item.get_global_range(2) / item.get_local_range(2),
item.get_global_range(1) / item.get_local_range(1),
item.get_global_range(0) / item.get_local_range(0)};
const int i_offset =
(threadIdx[0] * blockDim[1] + threadIdx[1]) * blockDim[2] +
threadIdx[2];
int i =
(((blockIdx[0] * gridDim[1] + blockIdx[1]) * gridDim[2] + blockIdx[2]) *
blockDim[0] * blockDim[1] * blockDim[2] +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
i += blockDim[0] * blockDim[1] * blockDim[2];
if (i >= static_cast<int>(locks_.extent(0))) {
i = i_offset;
}
}
return i;
}
};
#endif
#ifdef KOKKOS_ENABLE_OPENMPTARGET
template <class MemorySpace>
struct Random_UniqueIndex<
Kokkos::Device<Kokkos::Experimental::OpenMPTarget, MemorySpace>> {
using locks_view_type =
View<int**,
Kokkos::Device<Kokkos::Experimental::OpenMPTarget, MemorySpace>>;
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks) {
const int team_size = omp_get_num_threads();
int i = omp_get_team_num() * team_size + omp_get_thread_num();
const int lock_size = locks.extent_int(0);
while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
i = (i + 1) % lock_size;
}
return i;
}
};
#endif
} // namespace Impl
template <class DeviceType>
class Random_XorShift64_Pool;
template <class DeviceType>
class Random_XorShift64 {
private:
uint64_t state_;
const int state_idx_;
friend class Random_XorShift64_Pool<DeviceType>;
public:
using device_type = DeviceType;
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max();
constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max();
KOKKOS_INLINE_FUNCTION
Random_XorShift64(uint64_t state, int state_idx = 0)
: state_(state == 0 ? uint64_t(1318319) : state), state_idx_(state_idx) {}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
state_ ^= state_ >> 12;
state_ ^= state_ << 25;
state_ ^= state_ >> 27;
uint64_t tmp = state_ * 2685821657736338717ULL;
tmp = tmp >> 16;
return static_cast<uint32_t>(tmp & MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
state_ ^= state_ >> 12;
state_ ^= state_ << 25;
state_ ^= state_ >> 27;
return (state_ * 2685821657736338717ULL) - 1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND / range) * range;
uint32_t tmp = urand();
while (tmp >= max_val) tmp = urand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end) {
return urand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64 / range) * range;
uint64_t tmp = urand64();
while (tmp >= max_val) tmp = urand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
return urand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int rand() { return static_cast<int>(urand() / 2); }
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND / range) * range;
int tmp = rand();
while (tmp >= max_val) tmp = rand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end) {
return rand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64 / range) * range;
int64_t tmp = rand64();
while (tmp >= max_val) tmp = rand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end) {
return rand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
float frand() { return urand64() / static_cast<float>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64() / static_cast<float>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
return frand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
double drand() { return urand64() / static_cast<double>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64() / static_cast<double>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
return drand(end - start) + start;
}
// Box-muller method for drawing a standard normal distributed random
// number
KOKKOS_INLINE_FUNCTION
double normal() {
constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
const double u = drand();
const double v = drand();
const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u));
const double theta = v * two_pi;
return r * Kokkos::cos(theta);
}
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev = 1.0) {
return mean + normal() * std_dev;
}
};
template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift64_Pool {
public:
using device_type = typename DeviceType::device_type;
private:
using execution_space = typename device_type::execution_space;
using locks_type = View<int**, device_type>;
using state_data_type = View<uint64_t**, device_type>;
locks_type locks_ = {};
state_data_type state_ = {};
int num_states_ = {};
int padding_ = {};
public:
using generator_type = Random_XorShift64<DeviceType>;
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
KOKKOS_DEFAULTED_FUNCTION Random_XorShift64_Pool() = default;
KOKKOS_DEFAULTED_FUNCTION Random_XorShift64_Pool(
Random_XorShift64_Pool const&) = default;
KOKKOS_DEFAULTED_FUNCTION Random_XorShift64_Pool& operator=(
Random_XorShift64_Pool const&) = default;
#else
Random_XorShift64_Pool() = default;
#endif
Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
init(seed, execution_space().concurrency());
}
void init(uint64_t seed, int num_states) {
if (seed == 0) seed = uint64_t(1318319);
// I only want to pad on CPU like archs (less than 1000 threads). 64 is a
// magic number, or random number I just wanted something not too large and
// not too small. 64 sounded fine.
padding_ = num_states < 1000 ? 64 : 1;
num_states_ = num_states;
locks_ =
locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_);
state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_,
padding_);
typename state_data_type::HostMirror h_state =
Kokkos::create_mirror_view(Kokkos::WithoutInitializing, state_);
typename locks_type::HostMirror h_lock =
Kokkos::create_mirror_view(Kokkos::WithoutInitializing, locks_);
// Execute on the HostMirror's default execution space.
Random_XorShift64<typename state_data_type::HostMirror::execution_space>
gen(seed, 0);
for (int i = 0; i < 17; i++) gen.rand();
for (int i = 0; i < num_states_; i++) {
int n1 = gen.rand();
int n2 = gen.rand();
int n3 = gen.rand();
int n4 = gen.rand();
h_state(i, 0) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
(((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
(((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
(((static_cast<uint64_t>(n4)) & 0xffff) << 48);
h_lock(i, 0) = 0;
}
deep_copy(state_, h_state);
deep_copy(locks_, h_lock);
}
KOKKOS_INLINE_FUNCTION Random_XorShift64<DeviceType> get_state() const {
KOKKOS_EXPECTS(num_states_ > 0);
const int i = Impl::Random_UniqueIndex<device_type>::get_state_idx(locks_);
return Random_XorShift64<DeviceType>(state_(i, 0), i);
}
// NOTE: state_idx MUST be unique and less than num_states
KOKKOS_INLINE_FUNCTION
Random_XorShift64<DeviceType> get_state(const int state_idx) const {
return Random_XorShift64<DeviceType>(state_(state_idx, 0), state_idx);
}
KOKKOS_INLINE_FUNCTION
void free_state(const Random_XorShift64<DeviceType>& state) const {
state_(state.state_idx_, 0) = state.state_;
// Release the lock only after the state has been updated in memory
Kokkos::memory_fence();
locks_(state.state_idx_, 0) = 0;
}
};
template <class DeviceType>
class Random_XorShift1024 {
using execution_space = typename DeviceType::execution_space;
private:
int p_;
const int state_idx_;
Impl::Random_XorShift1024_State<
Impl::Random_XorShift1024_UseCArrayState<execution_space>::value>
state_;
friend class Random_XorShift1024_Pool<DeviceType>;
public:
using pool_type = Random_XorShift1024_Pool<DeviceType>;
using device_type = DeviceType;
constexpr static uint32_t MAX_URAND = std::numeric_limits<uint32_t>::max();
constexpr static uint64_t MAX_URAND64 = std::numeric_limits<uint64_t>::max();
constexpr static int32_t MAX_RAND = std::numeric_limits<int32_t>::max();
constexpr static int64_t MAX_RAND64 = std::numeric_limits<int64_t>::max();
KOKKOS_INLINE_FUNCTION
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
int state_idx = 0)
: p_(p), state_idx_(state_idx), state_(state, state_idx) {}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[p_];
uint64_t state_1 = state_[p_ = (p_ + 1) & 15];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = (state_[p_] = state_0 ^ state_1) * 1181783497276652981ULL;
tmp = tmp >> 16;
return static_cast<uint32_t>(tmp & MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[p_];
uint64_t state_1 = state_[p_ = (p_ + 1) & 15];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return ((state_[p_] = state_0 ^ state_1) * 1181783497276652981LL) - 1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND / range) * range;
uint32_t tmp = urand();
while (tmp >= max_val) tmp = urand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end) {
return urand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64 / range) * range;
uint64_t tmp = urand64();
while (tmp >= max_val) tmp = urand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
return urand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int rand() { return static_cast<int>(urand() / 2); }
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND / range) * range;
int tmp = rand();
while (tmp >= max_val) tmp = rand();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end) {
return rand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() { return static_cast<int64_t>(urand64() / 2); }
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64 / range) * range;
int64_t tmp = rand64();
while (tmp >= max_val) tmp = rand64();
return tmp % range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end) {
return rand64(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
float frand() { return urand64() / static_cast<float>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64() / static_cast<float>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
return frand(end - start) + start;
}
KOKKOS_INLINE_FUNCTION
double drand() { return urand64() / static_cast<double>(MAX_URAND64); }
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64() / static_cast<double>(MAX_URAND64);
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
return drand(end - start) + start;
}
// Box-muller method for drawing a standard normal distributed random
// number
KOKKOS_INLINE_FUNCTION
double normal() {
constexpr auto two_pi = 2 * Kokkos::numbers::pi_v<double>;
const double u = drand();
const double v = drand();
const double r = Kokkos::sqrt(-2.0 * Kokkos::log(u));
const double theta = v * two_pi;
return r * Kokkos::cos(theta);
}
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev = 1.0) {
return mean + normal() * std_dev;
}
};
template <class DeviceType = Kokkos::DefaultExecutionSpace>
class Random_XorShift1024_Pool {
public:
using device_type = typename DeviceType::device_type;
private:
using execution_space = typename device_type::execution_space;
using locks_type = View<int**, device_type>;
using int_view_type = View<int**, device_type>;
using state_data_type = View<uint64_t* [16], device_type>;
locks_type locks_ = {};
state_data_type state_ = {};
int_view_type p_ = {};
int num_states_ = {};
int padding_ = {};
friend class Random_XorShift1024<DeviceType>;
public:
using generator_type = Random_XorShift1024<DeviceType>;
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
KOKKOS_DEFAULTED_FUNCTION Random_XorShift1024_Pool() = default;
KOKKOS_DEFAULTED_FUNCTION Random_XorShift1024_Pool(
Random_XorShift1024_Pool const&) = default;
KOKKOS_DEFAULTED_FUNCTION Random_XorShift1024_Pool& operator=(
Random_XorShift1024_Pool const&) = default;
#else
Random_XorShift1024_Pool() = default;
#endif
Random_XorShift1024_Pool(uint64_t seed) {
num_states_ = 0;
init(seed, execution_space().concurrency());
}
void init(uint64_t seed, int num_states) {
if (seed == 0) seed = uint64_t(1318319);
// I only want to pad on CPU like archs (less than 1000 threads). 64 is a
// magic number, or random number I just wanted something not too large and
// not too small. 64 sounded fine.
padding_ = num_states < 1000 ? 64 : 1;
num_states_ = num_states;
locks_ =
locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_);
state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_);
typename state_data_type::HostMirror h_state =
Kokkos::create_mirror_view(Kokkos::WithoutInitializing, state_);
typename locks_type::HostMirror h_lock =
Kokkos::create_mirror_view(Kokkos::WithoutInitializing, locks_);
typename int_view_type::HostMirror h_p =
Kokkos::create_mirror_view(Kokkos::WithoutInitializing, p_);
// Execute on the HostMirror's default execution space.
Random_XorShift64<typename state_data_type::HostMirror::execution_space>
gen(seed, 0);
for (int i = 0; i < 17; i++) gen.rand();
for (int i = 0; i < num_states_; i++) {
for (int j = 0; j < 16; j++) {
int n1 = gen.rand();
int n2 = gen.rand();
int n3 = gen.rand();
int n4 = gen.rand();
h_state(i, j) = (((static_cast<uint64_t>(n1)) & 0xffff) << 00) |
(((static_cast<uint64_t>(n2)) & 0xffff) << 16) |
(((static_cast<uint64_t>(n3)) & 0xffff) << 32) |
(((static_cast<uint64_t>(n4)) & 0xffff) << 48);
}
h_p(i, 0) = 0;
h_lock(i, 0) = 0;
}
deep_copy(state_, h_state);
deep_copy(locks_, h_lock);
}
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<DeviceType> get_state() const {
KOKKOS_EXPECTS(num_states_ > 0);
const int i = Impl::Random_UniqueIndex<device_type>::get_state_idx(locks_);
return Random_XorShift1024<DeviceType>(state_, p_(i, 0), i);
};
// NOTE: state_idx MUST be unique and less than num_states
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
return Random_XorShift1024<DeviceType>(state_, p_(state_idx, 0), state_idx);
}
KOKKOS_INLINE_FUNCTION
void free_state(const Random_XorShift1024<DeviceType>& state) const {
for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
p_(state.state_idx_, 0) = state.p_;
// Release the lock only after the state has been updated in memory
Kokkos::memory_fence();
locks_(state.state_idx_, 0) = 0;
}
};
namespace Impl {
template <class ViewType, class RandomPool, int loops, int rank,
class IndexType>
struct fill_random_functor_begin_end;
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 0,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
a() = Rand::draw(gen, begin, end);
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 1,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0)))
a(idx) = Rand::draw(gen, begin, end);
}
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 2,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0))) {
for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++)
a(idx, k) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 3,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0))) {
for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++)
a(idx, k, l) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 4,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0))) {
for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++)
for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++)
a(idx, k, l, m) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 5,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0))) {
for (IndexType l = 0; l < static_cast<IndexType>(a.extent(1)); l++)
for (IndexType m = 0; m < static_cast<IndexType>(a.extent(2)); m++)
for (IndexType n = 0; n < static_cast<IndexType>(a.extent(3)); n++)
for (IndexType o = 0; o < static_cast<IndexType>(a.extent(4));
o++)
a(idx, l, m, n, o) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 6,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0))) {
for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++)
for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++)
for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4));
n++)
for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5));
o++)
a(idx, k, l, m, n, o) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 7,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0))) {
for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++)
for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++)
for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4));
n++)
for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5));
o++)
for (IndexType p = 0; p < static_cast<IndexType>(a.extent(6));
p++)
a(idx, k, l, m, n, o, p) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template <class ViewType, class RandomPool, int loops, class IndexType>
struct fill_random_functor_begin_end<ViewType, RandomPool, loops, 8,
IndexType> {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand<typename RandomPool::generator_type,
typename ViewType::non_const_value_type>;
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast<IndexType>(a.extent(0))) {
for (IndexType k = 0; k < static_cast<IndexType>(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast<IndexType>(a.extent(2)); l++)
for (IndexType m = 0; m < static_cast<IndexType>(a.extent(3)); m++)
for (IndexType n = 0; n < static_cast<IndexType>(a.extent(4));
n++)
for (IndexType o = 0; o < static_cast<IndexType>(a.extent(5));
o++)
for (IndexType p = 0; p < static_cast<IndexType>(a.extent(6));
p++)
for (IndexType q = 0;
q < static_cast<IndexType>(a.extent(7)); q++)
a(idx, k, l, m, n, o, p, q) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template <class ExecutionSpace, class ViewType, class RandomPool,
class IndexType = int64_t>
void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
typename ViewType::const_value_type begin,
typename ViewType::const_value_type end) {
int64_t LDA = a.extent(0);
if (LDA > 0)
parallel_for(
"Kokkos::fill_random",
Kokkos::RangePolicy<ExecutionSpace>(exec, 0, (LDA + 127) / 128),
Impl::fill_random_functor_begin_end<ViewType, RandomPool, 128,
ViewType::rank, IndexType>(
a, g, begin, end));
}
} // namespace Impl
template <class ExecutionSpace, class ViewType, class RandomPool,
class IndexType = int64_t>
void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
typename ViewType::const_value_type begin,
typename ViewType::const_value_type end) {
Impl::apply_to_view_of_static_rank(
[&](auto dst) { Kokkos::Impl::fill_random(exec, dst, g, begin, end); },
a);
}
template <class ExecutionSpace, class ViewType, class RandomPool,
class IndexType = int64_t>
void fill_random(const ExecutionSpace& exec, ViewType a, RandomPool g,
typename ViewType::const_value_type range) {
fill_random(exec, a, g, 0, range);
}
template <class ViewType, class RandomPool, class IndexType = int64_t>
void fill_random(ViewType a, RandomPool g,
typename ViewType::const_value_type begin,
typename ViewType::const_value_type end) {
Kokkos::fence(
"fill_random: fence before since no execution space instance provided");
typename ViewType::execution_space exec;
fill_random(exec, a, g, begin, end);
exec.fence(
"fill_random: fence after since no execution space instance provided");
}
template <class ViewType, class RandomPool, class IndexType = int64_t>
void fill_random(ViewType a, RandomPool g,
typename ViewType::const_value_type range) {
Kokkos::fence(
"fill_random: fence before since no execution space instance provided");
typename ViewType::execution_space exec;
fill_random(exec, a, g, 0, range);
exec.fence(
"fill_random: fence after since no execution space instance provided");
}
} // namespace Kokkos
#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
#undef KOKKOS_IMPL_PUBLIC_INCLUDE
#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
#endif
#endif