Update to Kokkos r2.04.04 and add workaround for performance regression

This commit is contained in:
Stan Moore
2017-10-03 11:30:00 -06:00
parent ee487ef4aa
commit a55adf4a68
26 changed files with 1222 additions and 562 deletions

View File

@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
}
#endif
#if defined(KOKKOS_ENABLE_ROCM)
template<>
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
private:
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
public:
typedef Kokkos::Experimental::ROCm device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
tmp = tmp>>16;
return static_cast<uint32_t>(tmp&MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND/range)*range;
uint32_t tmp = urand();
while(tmp>=max_val)
urand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end ) {
return urand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64/range)*range;
uint64_t tmp = urand64();
while(tmp>=max_val)
urand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
return urand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int rand() {
return static_cast<int>(urand()/2);
}
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND/range)*range;
int tmp = rand();
while(tmp>=max_val)
rand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end ) {
return rand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() {
return static_cast<int64_t>(urand64()/2);
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64/range)*range;
int64_t tmp = rand64();
while(tmp>=max_val)
rand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end ) {
return rand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
float frand() {
return 1.0f * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end ) {
return frand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
double drand() {
return 1.0 * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end ) {
return frand(end-start)+start;
}
//Marsaglia polar method for drawing a standard normal distributed random number
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while(S>=1.0) {
U = 2.0*drand() - 1.0;
const double V = 2.0*drand() - 1.0;
S = U*U+V*V;
}
return U*std::sqrt(-2.0*log(S)/S);
}
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev=1.0) {
return mean + normal()*std_dev;
}
};
template<>
inline
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
#else
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
state_(state.state_idx_) = state.state_;
locks_(state.state_idx_) = 0;
return;
#endif
}
template<>
inline
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
for(int i=0; i<16; i++)
state_(state.state_idx_,i) = state.state_[i];
locks_(state.state_idx_) = 0;
return;
#endif
}
#endif