diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp new file mode 100644 index 0000000000..5b13b8cbe9 --- /dev/null +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -0,0 +1,1551 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos +// Manycore Performance-Portable Multidimensional Arrays +// +// Copyright (2012) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + + +#include +#ifdef KOKKOS_HAVE_CUDA +#include +#endif +#include +#include +#include + +#ifndef KOKKOS_RANDOM_HPP +#define KOKKOS_RANDOM_HPP + +// These generators are based on Vigna, Sebastiano (2014). "An experimental exploration of Marsaglia's xorshift generators, scrambled" +// See: http://arxiv.org/abs/1402.6246 + +namespace Kokkos { + + /*Template functions to get equidistributed random numbers from a generator for a specific Scalar type + + template + struct rand{ + + //Max value returned by draw(Generator& gen) + KOKKOS_INLINE_FUNCTION + static Scalar max(); + + //Returns a value between zero and max() + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen); + + //Returns a value between zero and range() + //Note: for floating point values range can be larger than max() + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen, const Scalar& range){} + + //Return value between start and end + KOKKOS_INLINE_FUNCTION + static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end); + }; + + The Random number generators themselves have two components a state-pool and the actual generator + A state-pool manages a number of generators, so that each active thread is able to grep its own. 
+ This allows the generation of random numbers which are independent between threads. Note that + in contrast to CuRand none of the functions of the pool (or the generator) are collectives, + i.e. all functions can be called inside conditionals. + + template + class Pool { + public: + //The Kokkos device type + typedef Device device_type; + //The actual generator type + typedef Generator generator_type; + + //Default constructor: does not initialize a pool + Pool(); + + //Initializing constructor: calls init(seed,Device_Specific_Number); + Pool(unsigned int seed); + + //Initialize Pool with seed as a starting seed with a pool_size of num_states + //The Random_XorShift64 generator is used in serial to initialize all states, + //thus the initialization process is platform independent and deterministic. + void init(unsigned int seed, int num_states); + + //Get a generator. This will lock one of the states, guaranteeing that each thread + //will have its private generator. Note: on Cuda getting a state involves atomics, + //and is thus not deterministic! + generator_type get_state(); + + //Give a state back to the pool. This unlocks the state, and writes the modified + //state of the generator back to the pool. + void free_state(generator_type gen); + + } + + template + class Generator { + public: + //The Kokkos device type + typedef DeviceType device_type; + + //Max return values of respective [X]rand[S]() functions + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + + //Init with a state and the idx with respect to pool. 
Note: in serial the + //Generator can be used by just giving it the necessary state arguments + KOKKOS_INLINE_FUNCTION + Generator (STATE_ARGUMENTS, int state_idx = 0); + + //Draw a equidistributed uint32_t in the range (0,MAX_URAND] + KOKKOS_INLINE_FUNCTION + uint32_t urand(); + + //Draw a equidistributed uint64_t in the range (0,MAX_URAND64] + KOKKOS_INLINE_FUNCTION + uint64_t urand64(); + + //Draw a equidistributed uint32_t in the range (0,range] + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range); + + //Draw a equidistributed uint32_t in the range (start,end] + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ); + + //Draw a equidistributed uint64_t in the range (0,range] + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range); + + //Draw a equidistributed uint64_t in the range (start,end] + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ); + + //Draw a equidistributed int in the range (0,MAX_RAND] + KOKKOS_INLINE_FUNCTION + int rand(); + + //Draw a equidistributed int in the range (0,range] + KOKKOS_INLINE_FUNCTION + int rand(const int& range); + + //Draw a equidistributed int in the range (start,end] + KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ); + + //Draw a equidistributed int64_t in the range (0,MAX_RAND64] + KOKKOS_INLINE_FUNCTION + int64_t rand64(); + + //Draw a equidistributed int64_t in the range (0,range] + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range); + + //Draw a equidistributed int64_t in the range (start,end] + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ); + + //Draw a equidistributed float in the range (0,1.0] + KOKKOS_INLINE_FUNCTION + float frand(); + + //Draw a equidistributed float in the range (0,range] + KOKKOS_INLINE_FUNCTION + float frand(const float& range); + + //Draw a equidistributed float in the range (start,end] + KOKKOS_INLINE_FUNCTION + float frand(const 
float& start, const float& end ); + + //Draw a equidistributed double in the range (0,1.0] + KOKKOS_INLINE_FUNCTION + double drand(); + + //Draw a equidistributed double in the range (0,range] + KOKKOS_INLINE_FUNCTION + double drand(const double& range); + + //Draw a equidistributed double in the range (start,end] + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ); + + //Draw a standard normal distributed double + KOKKOS_INLINE_FUNCTION + double normal() ; + + //Draw a normal distributed double with given mean and standard deviation + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0); + } + + //Additional Functions: + + //Fills view with random numbers in the range (0,range] + template + void fill_random(ViewType view, PoolType pool, ViewType::value_type range); + + //Fills view with random numbers in the range (start,end] + template + void fill_random(ViewType view, PoolType pool, + ViewType::value_type start, ViewType::value_type end); + +*/ + + template + struct rand; + + + template + struct rand { + + KOKKOS_INLINE_FUNCTION + static short max(){return 127;} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen) + {return short((gen.rand()&0xff+256)%256);} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const char& range) + {return char(gen.rand(range));} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const char& start, const char& end) + {return char(gen.rand(start,end));} + + }; + + template + struct rand { + KOKKOS_INLINE_FUNCTION + static short max(){return 32767;} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen) + {return short((gen.rand()&0xffff+65536)%32768);} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const short& range) + {return short(gen.rand(range));} + KOKKOS_INLINE_FUNCTION + static short draw(Generator& gen, const short& start, const short& end) + {return short(gen.rand(start,end));} + + }; + + template + struct rand { + 
KOKKOS_INLINE_FUNCTION + static int max(){return Generator::MAX_RAND;} + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen) + {return gen.rand();} + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen, const int& range) + {return gen.rand(range);} + KOKKOS_INLINE_FUNCTION + static int draw(Generator& gen, const int& start, const int& end) + {return gen.rand(start,end);} + + }; + + template + struct rand { + KOKKOS_INLINE_FUNCTION + static unsigned int max(){return Generator::MAX_URAND;} + KOKKOS_INLINE_FUNCTION + static unsigned int draw(Generator& gen) + {return gen.urand();} + KOKKOS_INLINE_FUNCTION + static unsigned int draw(Generator& gen, const unsigned int& range) + {return gen.urand(range);} + KOKKOS_INLINE_FUNCTION + static unsigned int draw(Generator& gen, const unsigned int& start, const unsigned int& end) + {return gen.urand(start,end);} + + }; + + template + struct rand { + KOKKOS_INLINE_FUNCTION + static int64_t max(){return Generator::MAX_RAND64;} + KOKKOS_INLINE_FUNCTION + static int64_t draw(Generator& gen) + {return gen.rand64();} + KOKKOS_INLINE_FUNCTION + static int64_t draw(Generator& gen, const int64_t& range) + {return gen.rand64(range);} + KOKKOS_INLINE_FUNCTION + static int64_t draw(Generator& gen, const int64_t& start, const int64_t& end) + {return gen.rand64(start,end);} + + }; + + template + struct rand { + KOKKOS_INLINE_FUNCTION + static uint64_t max(){return Generator::MAX_URAND64;} + KOKKOS_INLINE_FUNCTION + static uint64_t draw(Generator& gen) + {return gen.urand64();} + KOKKOS_INLINE_FUNCTION + static uint64_t draw(Generator& gen, const uint64_t& range) + {return gen.urand64(range);} + KOKKOS_INLINE_FUNCTION + static uint64_t draw(Generator& gen, const uint64_t& start, const uint64_t& end) + {return gen.urand64(start,end);} + + }; + + template + struct rand { + KOKKOS_INLINE_FUNCTION + static float max(){return 1.0f;} + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen) + {return gen.frand();} + 
KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen, const float& range) + {return gen.frand(range);} + KOKKOS_INLINE_FUNCTION + static float draw(Generator& gen, const float& start, const float& end) + {return gen.frand(start,end);} + + }; + + template + struct rand { + KOKKOS_INLINE_FUNCTION + static double max(){return 1.0;} + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen) + {return gen.drand();} + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen, const double& range) + {return gen.drand(range);} + KOKKOS_INLINE_FUNCTION + static double draw(Generator& gen, const double& start, const double& end) + {return gen.drand(start,end);} + + }; + + template + class Random_XorShift64_Pool; + + template + class Random_XorShift64 { + private: + uint64_t state_; + const int state_idx_; + friend class Random_XorShift64_Pool; + public: + + typedef DeviceType device_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffff/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffLL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift64 (uint64_t state, int state_idx = 0) + : state_(state),state_idx_(state_idx){} + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + + uint64_t tmp = state_ * 2685821657736338717ULL; + tmp = tmp>>16; + return static_cast(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + return (state_ * 2685821657736338717ULL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + tmp = urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION 
+    uint64_t urand64(const uint64_t& range) {
+      // Rejection sampling: max_val is the largest multiple of range that does not
+      // exceed MAX_URAND64, so redrawing values >= max_val keeps tmp%range unbiased.
+      // range must be non-zero (it is used as a divisor).
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      // Same rejection scheme as urand64(range); range must be non-zero.
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      // Same rejection scheme as urand64(range); range must be non-zero.
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      // The polar method needs (U,V) uniform on the square (-1,1)x(-1,1); taking
+      // drand() directly would restrict U to [0,1] and the result could never be
+      // negative. Points outside the unit circle are rejected, and so is the
+      // origin, because the result divides by S and takes log(S).
+      while(S>=1.0 || S==0.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+
return U*sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + + }; + + template + class Random_XorShift64_Pool { + private: + typedef View lock_type; + typedef View state_data_type; + lock_type locks_; + state_data_type state_; + int num_states_; + + public: + typedef Random_XorShift64 generator_type; + typedef DeviceType device_type; + + Random_XorShift64_Pool() { + num_states_ = 0; + } + Random_XorShift64_Pool(unsigned int seed) { + num_states_ = 0; + init(seed,DeviceType::max_hardware_threads()); + } + + void init(unsigned int seed, int num_states) { + num_states_ = num_states; + + locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_); + state_ = state_data_type("Kokkos::Random_XorShift64::state",num_states_); + + typename state_data_type::HostMirror h_state = create_mirror_view(state_); + typename lock_type::HostMirror h_lock = create_mirror_view(locks_); + + Random_XorShift64 gen(seed,0); + for(int i = 0; i < 17; i++) + gen.rand(); + for(int i = 0; i < num_states_; i++) { + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i) = (((static_cast(n1)) & 0xffff)<<00) | + (((static_cast(n2)) & 0xffff)<<16) | + (((static_cast(n3)) & 0xffff)<<32) | + (((static_cast(n4)) & 0xffff)<<48); + h_lock(i) = 0; + } + deep_copy(state_,h_state); + deep_copy(locks_,h_lock); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift64 get_state() const { + const int i = DeviceType::hardware_thread_id();; + return Random_XorShift64(state_(i),i); + } + + KOKKOS_INLINE_FUNCTION + void free_state(const Random_XorShift64& state) const { + state_(state.state_idx_) = state.state_; + } + }; + + + template + class Random_XorShift1024_Pool; + + template + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t state_[16]; + friend class Random_XorShift1024_Pool; + public: + + typedef DeviceType device_type; + 
+ enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0): + p_(p),state_idx_(state_idx){ + for(int i=0 ; i<16; i++) + state_[i] = state[i]; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand() { + uint64_t state_0 = state_[ p_ ]; + uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL; + tmp = tmp>>16; + return static_cast(tmp&MAX_URAND); + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64() { + uint64_t state_0 = state_[ p_ ]; + uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ]; + state_1 ^= state_1 << 31; + state_1 ^= state_1 >> 11; + state_0 ^= state_0 >> 30; + return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& range) { + const uint32_t max_val = (MAX_URAND/range)*range; + uint32_t tmp = urand(); + while(tmp>=max_val) + tmp = urand(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint32_t urand(const uint32_t& start, const uint32_t& end ) { + return urand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& range) { + const uint64_t max_val = (MAX_URAND64/range)*range; + uint64_t tmp = urand64(); + while(tmp>=max_val) + tmp = urand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + uint64_t urand64(const uint64_t& start, const uint64_t& end ) { + return urand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int rand() { + return static_cast(urand()/2); + } + + KOKKOS_INLINE_FUNCTION + int rand(const int& range) { + const int max_val = (MAX_RAND/range)*range; + int tmp = rand(); + while(tmp>=max_val) + tmp = rand(); + return tmp%range; + } + + 
KOKKOS_INLINE_FUNCTION + int rand(const int& start, const int& end ) { + return rand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64() { + return static_cast(urand64()/2); + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& range) { + const int64_t max_val = (MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + tmp = rand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return frand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = drand(); + const double V = drand(); + S = U*U+V*V; + } + return U*sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + + + template + class Random_XorShift1024_Pool { + private: + typedef View int_view_type; + typedef View state_data_type; + + int_view_type locks_; + state_data_type state_; + int_view_type p_; + int num_states_; + + public: + typedef Random_XorShift1024 generator_type; + + typedef DeviceType device_type; + + Random_XorShift1024_Pool() { + num_states_ = 0; + } + + inline + Random_XorShift1024_Pool(unsigned int seed){ + 
num_states_ = 0; + init(seed,DeviceType::max_hardware_threads()); + } + + inline + void init(unsigned int seed, int num_states) { + num_states_ = num_states; + + locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_); + state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_); + p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_); + + typename state_data_type::HostMirror h_state = create_mirror_view(state_); + typename int_view_type::HostMirror h_lock = create_mirror_view(locks_); + typename int_view_type::HostMirror h_p = create_mirror_view(p_); + Random_XorShift64 gen(seed,0); + for(int i = 0; i < 17; i++) + gen.rand(); + for(int i = 0; i < num_states_; i++) { + for(int j = 0; j < 16 ; j++) { + int n1 = gen.rand(); + int n2 = gen.rand(); + int n3 = gen.rand(); + int n4 = gen.rand(); + h_state(i,j) = (((static_cast(n1)) & 0xffff)<<00) | + (((static_cast(n2)) & 0xffff)<<16) | + (((static_cast(n3)) & 0xffff)<<32) | + (((static_cast(n4)) & 0xffff)<<48); + } + h_p(i) = 0; + h_lock(i) = 0; + } + deep_copy(state_,h_state); + deep_copy(locks_,h_lock); + } + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 get_state() const { + const int i = DeviceType::hardware_thread_id(); + return Random_XorShift1024(&state_(i,0),p_(i),i); + }; + + KOKKOS_INLINE_FUNCTION + void free_state(const Random_XorShift1024& state) const { + for(int i = 0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + p_(state.state_idx_) = state.p_; + } + }; + +#if defined(KOKKOS_HAVE_CUDA) && defined(__CUDACC__) + + template<> + class Random_XorShift1024 { + private: + int p_; + const int state_idx_; + uint64_t* state_; + friend class Random_XorShift1024_Pool; + public: + + typedef Kokkos::Cuda device_type; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + enum {MAX_RAND = static_cast(0xffffffffU/2)}; + enum {MAX_RAND64 = static_cast(0xffffffffffffffffULL/2-1)}; + + KOKKOS_INLINE_FUNCTION + Random_XorShift1024 
(uint64_t* state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(state){
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      // Advance the 16-word state at position p_ (p_ wraps with & 15), then
+      // return 32 bits taken from the scrambled product after dropping its low 16.
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      // Rejection sampling: redraw while tmp lies at or above the largest multiple
+      // of range below MAX_URAND. The redraw must be stored in tmp; discarding it
+      // leaves the loop condition unchanged and the loop never terminates.
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      // Same rejection scheme as urand(range); the redraw is assigned to tmp.
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      // Same rejection scheme as urand(range); the redraw is assigned to tmp.
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = 
(MAX_RAND64/range)*range; + int64_t tmp = rand64(); + while(tmp>=max_val) + rand64(); + return tmp%range; + } + + KOKKOS_INLINE_FUNCTION + int64_t rand64(const int64_t& start, const int64_t& end ) { + return rand64(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + float frand() { + return 1.0f * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + float frand(const float& start, const float& end ) { + return frand(end-start)+start; + } + + KOKKOS_INLINE_FUNCTION + double drand() { + return 1.0 * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& range) { + return range * urand64()/MAX_URAND64; + } + + KOKKOS_INLINE_FUNCTION + double drand(const double& start, const double& end ) { + return frand(end-start)+start; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + KOKKOS_INLINE_FUNCTION + double normal() { + double S = 2.0; + double U; + while(S>=1.0) { + U = drand(); + const double V = drand(); + S = U*U+V*V; + } + return U*sqrt(-2.0*log(S)/S); + } + + KOKKOS_INLINE_FUNCTION + double normal(const double& mean, const double& std_dev=1.0) { + return mean + normal()*std_dev; + } + }; + +template<> +inline +Random_XorShift64_Pool::Random_XorShift64_Pool(unsigned int seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift64 Random_XorShift64_Pool::get_state() const { +#ifdef __CUDA_ARCH__ + const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z; + int i = ((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) * + blockDim.x*blockDim.y*blockDim.z + i_offset; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim.x*blockDim.y*blockDim.z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift64(state_(i),i); +#else + return Random_XorShift64(state_(0),0); +#endif +} + +template<> 
+KOKKOS_INLINE_FUNCTION +void Random_XorShift64_Pool::free_state(const Random_XorShift64 &state) const { +#ifdef __CUDA_ARCH__ + state_(state.state_idx_) = state.state_; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +template<> +inline +Random_XorShift1024_Pool::Random_XorShift1024_Pool(unsigned int seed) { + num_states_ = 0; + init(seed,4*32768); +} + +template<> +KOKKOS_INLINE_FUNCTION +Random_XorShift1024 Random_XorShift1024_Pool::get_state() const { +#ifdef __CUDA_ARCH__ + const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z; + int i = ((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) * + blockDim.x*blockDim.y*blockDim.z + i_offset; + while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) { + i+=blockDim.x*blockDim.y*blockDim.z; + if(i>=num_states_) {i = i_offset;} + } + + return Random_XorShift1024(&state_(i,0), p_(i), i); +#else + return Random_XorShift1024(&state_(0,0), p_(0), 0); +#endif +} + +template<> +KOKKOS_INLINE_FUNCTION +void Random_XorShift1024_Pool::free_state(const Random_XorShift1024 &state) const { +#ifdef __CUDA_ARCH__ + for(int i=0; i<16; i++) + state_(state.state_idx_,i) = state.state_[i]; + locks_(state.state_idx_) = 0; + return; +#endif +} + + +#endif + + + +template +struct fill_random_functor_range; +template +struct fill_random_functor_begin_end; + +template +struct fill_random_functor_range{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_range{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool 
rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_range{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_range{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_range{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_range{ + typedef 
typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_range{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_range{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type range; + + typedef rand Rand; + + fill_random_functor_range(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type range_): + a(a_),rand_pool(rand_pool_),range(range_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + 
typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename 
ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +struct fill_random_functor_begin_end{ + typedef typename ViewType::device_type device_type; + ViewType a; + RandomPool rand_pool; + typename ViewType::const_value_type begin,end; + + typedef rand Rand; + + fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_, + typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_): + 
a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {} + + KOKKOS_INLINE_FUNCTION + void operator() (unsigned int i) const { + typename RandomPool::generator_type gen = rand_pool.get_state(); + for(unsigned int j=0;j +void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) { + int64_t LDA = a.dimension_0(); + if(LDA>0) + parallel_for((LDA+127)/128,fill_random_functor_range(a,g,range)); +} + +template +void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) { + int64_t LDA = a.dimension_0(); + if(LDA>0) + parallel_for((LDA+127)/128,fill_random_functor_begin_end(a,g,begin,end)); +} +} + +#endif