Update to Kokkos r2.04.04 and add workaround for performance regression

This commit is contained in:
Stan Moore
2017-10-03 11:30:00 -06:00
parent ee487ef4aa
commit a55adf4a68
26 changed files with 1222 additions and 562 deletions

View File

@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
}
#endif
#if defined(KOKKOS_ENABLE_ROCM)
template<>
class Random_XorShift1024<Kokkos::Experimental::ROCm> {
private:
int p_;
const int state_idx_;
uint64_t* state_;
const int stride_;
friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
public:
typedef Kokkos::Experimental::ROCm device_type;
typedef Random_XorShift1024_Pool<device_type> pool_type;
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
KOKKOS_INLINE_FUNCTION
Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
tmp = tmp>>16;
return static_cast<uint32_t>(tmp&MAX_URAND);
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[ p_ * stride_ ];
uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND/range)*range;
uint32_t tmp = urand();
while(tmp>=max_val)
urand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end ) {
return urand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64/range)*range;
uint64_t tmp = urand64();
while(tmp>=max_val)
urand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
return urand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int rand() {
return static_cast<int>(urand()/2);
}
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND/range)*range;
int tmp = rand();
while(tmp>=max_val)
rand();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end ) {
return rand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64() {
return static_cast<int64_t>(urand64()/2);
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64/range)*range;
int64_t tmp = rand64();
while(tmp>=max_val)
rand64();
return tmp%range;
}
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end ) {
return rand64(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
float frand() {
return 1.0f * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end ) {
return frand(end-start)+start;
}
KOKKOS_INLINE_FUNCTION
double drand() {
return 1.0 * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64()/MAX_URAND64;
}
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end ) {
return frand(end-start)+start;
}
//Marsaglia polar method for drawing a standard normal distributed random number
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while(S>=1.0) {
U = 2.0*drand() - 1.0;
const double V = 2.0*drand() - 1.0;
S = U*U+V*V;
}
return U*std::sqrt(-2.0*log(S)/S);
}
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev=1.0) {
return mean + normal()*std_dev;
}
};
template<>
inline
Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
#else
return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
state_(state.state_idx_) = state.state_;
locks_(state.state_idx_) = 0;
return;
#endif
}
template<>
inline
Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
num_states_ = 0;
init(seed,4*32768);
}
template<>
KOKKOS_INLINE_FUNCTION
Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
#ifdef __HCC_ACCELERATOR__
const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
i+=blockDim_x*blockDim_y*blockDim_z;
if(i>=num_states_) {i = i_offset;}
}
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
#else
return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
#endif
}
template<>
KOKKOS_INLINE_FUNCTION
void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
#ifdef __HCC_ACCELERATOR__
for(int i=0; i<16; i++)
state_(state.state_idx_,i) = state.state_[i];
locks_(state.state_idx_) = 0;
return;
#endif
}
#endif

View File

@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
TARGETS += KokkosAlgorithms_UnitTest_ROCm
TEST_TARGETS += test-rocm
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
TARGETS += KokkosAlgorithms_UnitTest_Threads
@ -51,6 +57,9 @@ endif
KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda
KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosAlgorithms_UnitTest_Cuda
./KokkosAlgorithms_UnitTest_Cuda
test-rocm: KokkosAlgorithms_UnitTest_ROCm
./KokkosAlgorithms_UnitTest_ROCm
test-threads: KokkosAlgorithms_UnitTest_Threads
./KokkosAlgorithms_UnitTest_Threads

View File

@ -0,0 +1,112 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_ROCM
#include <cstdint>
#include <iostream>
#include <iomanip>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <TestRandom.hpp>
#include <TestSort.hpp>
namespace Test {
class rocm : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
}
static void TearDownTestCase()
{
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
void rocm_test_random_xorshift64( int num_draws )
{
Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}
void rocm_test_random_xorshift1024( int num_draws )
{
Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
}
#define ROCM_RANDOM_XORSHIFT64( num_draws ) \
TEST_F( rocm, Random_XorShift64 ) { \
rocm_test_random_xorshift64(num_draws); \
}
#define ROCM_RANDOM_XORSHIFT1024( num_draws ) \
TEST_F( rocm, Random_XorShift1024 ) { \
rocm_test_random_xorshift1024(num_draws); \
}
#define ROCM_SORT_UNSIGNED( size ) \
TEST_F( rocm, SortUnsigned ) { \
Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size); \
}
ROCM_RANDOM_XORSHIFT64( 132141141 )
ROCM_RANDOM_XORSHIFT1024( 52428813 )
ROCM_SORT_UNSIGNED(171)
#undef ROCM_RANDOM_XORSHIFT64
#undef ROCM_RANDOM_XORSHIFT1024
#undef ROCM_SORT_UNSIGNED
}
#else
void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
#endif /* #ifdef KOKKOS_ENABLE_ROCM */