Update Kokkos library to v2.04.00
@@ -9,30 +9,6 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)

# Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA governs whether Kokkos allows
# use of lambdas at the outer level of parallel dispatch (that is, as
# the argument to an outer parallel_for, parallel_reduce, or
# parallel_scan). This works with non-CUDA execution spaces if C++11
# is enabled. It does not currently work with public releases of
# CUDA. If that changes, please change the default here to ON if CUDA
# and C++11 are ON.
IF (${PROJECT_NAME}_ENABLE_CXX11)
IF (${PACKAGE_NAME}_ENABLE_CUDA)
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT ON)
ENDIF ()
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ENDIF ()

TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA
KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
"Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan). This requires C++11. It also does not currently work with public releases of CUDA. As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled. If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined. For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
${Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT}
)

TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)

INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})

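For context, the option removed from this CMake fragment controls whether user code may pass a lambda directly at the outer dispatch level. A minimal sketch of what that looks like (the function, view names, and bounds here are illustrative only, not part of the diff):

  #include <Kokkos_Core.hpp>

  // y(i) += a * x(i): the kernel body is an outer-level lambda, which is only
  // accepted when KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA / the dispatch-lambda
  // option is in effect.
  void axpy(const int n, const double a,
            Kokkos::View<const double*> x, Kokkos::View<double*> y) {
    Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {
      y(i) += a * x(i);
    });
  }
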
@@ -152,10 +152,10 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return pointer(0) ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return const_pointer(0); }

~Array() = default ;
Array() = default ;
Array( const Array & ) = default ;
Array & operator = ( const Array & ) = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED Array() = default ;
KOKKOS_FUNCTION_DEFAULTED Array( const Array & ) = default ;
KOKKOS_FUNCTION_DEFAULTED Array & operator = ( const Array & ) = default ;

// Some supported compilers are not sufficiently C++11 compliant
// for default move constructor and move assignment operator.
@@ -209,7 +209,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }

~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & rhs ) = delete ;

@@ -278,7 +278,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }

~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & ) = delete ;

@@ -80,6 +80,11 @@
// Compiling NVIDIA device code, must use Cuda atomics:

#define KOKKOS_ENABLE_CUDA_ATOMICS

#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU)

#define KOKKOS_ENABLE_ROCM_ATOMICS

#endif

#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
@@ -154,6 +159,19 @@ const char * atomic_query_version()

} // namespace Kokkos

#if defined( KOKKOS_ENABLE_ROCM )
#include <ROCm/Kokkos_ROCm_Atomic.hpp>
namespace Kokkos {
namespace Impl {
extern KOKKOS_INLINE_FUNCTION
bool lock_address_rocm_space(void* ptr);

extern KOKKOS_INLINE_FUNCTION
void unlock_address_rocm_space(void* ptr);
}
}
#endif

#ifdef _WIN32
#include "impl/Kokkos_Atomic_Windows.hpp"
#else

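These macros only select which backend implements Kokkos' atomic free functions; calling code is unchanged regardless of the backend chosen. A minimal sketch (names illustrative, not from the diff):

  #include <Kokkos_Core.hpp>

  // Count the even entries of 'data'. Many threads may hit the same counter,
  // so the update must go through an atomic, which the macros above route to
  // CUDA, ROCm, GNU, ... intrinsics as appropriate.
  void count_even(const int n, Kokkos::View<const int*> data,
                  Kokkos::View<int> count) {
    Kokkos::parallel_for(n, KOKKOS_LAMBDA(const int i) {
      if (data(i) % 2 == 0) Kokkos::atomic_fetch_add(&count(), 1);
    });
  }
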
@@ -107,6 +107,11 @@ public:
re_ (val), im_ (0.0)
{}

// BUG HCC WORKAROUND
KOKKOS_INLINE_FUNCTION complex( const RealType& re, const RealType& im):
re_ (re), im_ (im)
{}

//! Constructor that takes the real and imaginary parts.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
@@ -227,6 +232,16 @@ public:
return re_;
}

//! Set the imaginary part of this complex number.
KOKKOS_INLINE_FUNCTION void imag (RealType v) {
im_ = v;
}

//! Set the real part of this complex number.
KOKKOS_INLINE_FUNCTION void real (RealType v) {
re_ = v;
}

KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const complex<RealType>& src) {
re_ += src.re_;
@@ -299,7 +314,7 @@ public:
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (y.real ()) + ::fabs (y.imag ());
const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());

// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
@@ -537,7 +552,7 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (real (y)) + ::fabs (imag (y));
const RealType s = std::fabs (real (y)) + std::fabs (imag (y));

// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,

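The scaling that both hunks switch from ::fabs to std::fabs is the usual guarded complex division: divide both operands by the 1-norm of y so that squaring y's components cannot overflow. A rough sketch of the underlying arithmetic (not the library's exact code):

  #include <Kokkos_Complex.hpp>
  #include <cmath>

  template <class RealType>
  Kokkos::complex<RealType> scaled_divide(const Kokkos::complex<RealType>& x,
                                          const Kokkos::complex<RealType>& y) {
    // x / y == (x/s) / (y/s) with s = |Re(y)| + |Im(y)|
    const RealType s = std::fabs(y.real()) + std::fabs(y.imag());
    const RealType yr = y.real() / s, yi = y.imag() / s;
    const RealType xr = x.real() / s, xi = x.imag() / s;
    const RealType d  = yr * yr + yi * yi;   // == |y/s|^2
    return Kokkos::complex<RealType>((xr * yr + xi * yi) / d,
                                     (xi * yr - xr * yi) / d);
  }
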
@@ -74,6 +74,10 @@
#include <Kokkos_Cuda.hpp>
#endif

#if defined( KOKKOS_ENABLE_ROCM )
#include <Kokkos_ROCm.hpp>
#endif

#include <Kokkos_Pair.hpp>
#include <Kokkos_MemoryPool.hpp>
#include <Kokkos_Array.hpp>

@@ -122,6 +122,13 @@ class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU
class Cuda; ///< Execution space for Cuda GPU
#endif

#if defined( KOKKOS_ENABLE_ROCM )
namespace Experimental {
class ROCmSpace ; ///< Memory space on ROCm GPU
class ROCm ; ///< Execution space for ROCm GPU
}
#endif

template<class ExecutionSpace, class MemorySpace>
struct Device;

@@ -140,6 +147,8 @@ namespace Kokkos {
typedef Cuda DefaultExecutionSpace;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
typedef Experimental::ROCm DefaultExecutionSpace ;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -185,6 +194,8 @@ namespace Impl {

#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace;
#else

@@ -98,18 +98,18 @@ public:
typedef View<size_type* , array_layout, device_type> row_map_type;
typedef View<DataType* , array_layout, device_type> entries_type;

entries_type entries;
row_map_type row_map;
entries_type entries;

//! Construct an empty view.
Crs () : entries(), row_map() {}
Crs() : row_map(), entries() {}

//! Copy constructor (shallow copy).
Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
Crs(const Crs& rhs) : row_map(rhs.row_map), entries(rhs.entries)
{}

template<class EntriesType, class RowMapType>
Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_)
{}

/** \brief Assign to a view of the rhs array.
@@ -117,8 +117,8 @@ public:
* then allocated memory is deallocated.
*/
Crs& operator= (const Crs& rhs) {
entries = rhs.entries;
row_map = rhs.row_map;
entries = rhs.entries;
return *this;
}

@@ -151,7 +151,7 @@ void get_crs_transpose_counts(

template< class OutCounts,
class InCrs>
void get_crs_row_map_from_counts(
typename OutCounts::value_type get_crs_row_map_from_counts(
OutCounts& out,
InCrs const& in,
std::string const& name = "row_map");
@@ -204,18 +204,20 @@ class CrsRowMapFromCounts {
using execution_space = typename InCounts::execution_space;
using value_type = typename OutRowMap::value_type;
using index_type = typename InCounts::size_type;
using last_value_type = Kokkos::View<value_type, execution_space>;
private:
InCounts in;
OutRowMap out;
InCounts m_in;
OutRowMap m_out;
last_value_type m_last_value;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i, value_type& update, bool final_pass) const {
update += in(i);
if (final_pass) {
out(i + 1) = update;
if (i == 0) {
out(0) = 0;
}
if (i < m_in.size()) {
update += m_in(i);
if (final_pass) m_out(i + 1) = update;
} else if (final_pass) {
m_out(0) = 0;
m_last_value() = update;
}
}
KOKKOS_INLINE_FUNCTION
@@ -226,12 +228,16 @@ class CrsRowMapFromCounts {
}
using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
in(arg_in),out(arg_out) {
m_in(arg_in), m_out(arg_out), m_last_value("last_value") {
}
value_type execute() {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
closure_type closure(*this, policy_type(0, in.size()));
closure_type closure(*this, policy_type(0, m_in.size() + 1));
closure.execute();
execution_space::fence();
auto last_value = Kokkos::create_mirror_view(m_last_value);
Kokkos::deep_copy(last_value, m_last_value);
return last_value();
}
};

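In other words, the reworked scan functor now produces the row map and also records the grand total in a separate rank-0 view, so that execute() can return it. A small hypothetical illustration of the expected input and output (numbers invented for the example):

  // counts  = { 2, 0, 3, 1 }      one entry count per row
  // row_map = { 0, 2, 2, 5, 6 }   length counts.size() + 1, prefix sums of counts
  // execute() returns 6, the total number of entries the caller must allocate.
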
@@ -297,13 +303,14 @@ void get_crs_transpose_counts(

template< class OutRowMap,
class InCounts>
void get_crs_row_map_from_counts(
typename OutRowMap::value_type get_crs_row_map_from_counts(
OutRowMap& out,
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}

template< class DataType,
@@ -328,6 +335,65 @@ void transpose_crs(
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}

template< class CrsType,
class Functor>
struct CountAndFill {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
data_type* fill = &(m_crs.entries(j));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
m_crs(crs),
m_functor(f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
Kokkos::Impl::ParallelFor<self_type, count_policy_type>;
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
Kokkos::Impl::ParallelFor<self_type, fill_policy_type>;
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
}
};

template< class CrsType,
class Functor>
void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
}

}} // namespace Kokkos::Experimental

#endif /* #define KOKKOS_CRS_HPP */

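A hedged usage sketch of the new count_and_fill_crs entry point: the user functor is invoked once per row in a counting pass (null fill pointer) and once in a filling pass (pointer into that row's slots). Everything named below is hypothetical, for illustration only:

  struct RowBuilder {
    // Counting pass: fill == nullptr, return how many entries row i needs.
    // Filling pass:  fill points at row i's slots in the entries view.
    template <class T>
    KOKKOS_INLINE_FUNCTION
    int operator()(const int i, T* fill) const {
      if (fill == nullptr) return 2;
      fill[0] = i;
      fill[1] = i + 1;
      return 2;
    }
  };

  template <class CrsType>
  void build_graph(CrsType& graph, typename CrsType::size_type nrows) {
    Kokkos::Experimental::count_and_fill_crs(graph, nrows, RowBuilder{});
  }
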
@@ -96,6 +96,14 @@

//----------------------------------------------------------------------------

#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_THREADS) || \
defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_QTHREADS) || \
defined(KOKKOS_ENABLE_ROCM) || defined(KOKKOS_ENABLE_OPENMPTARGET)
#define KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND
#endif

#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA

#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Compiling with a CUDA compiler.
//
@@ -133,6 +141,9 @@

#if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
#define KOKKOS_LAMBDA [=]__device__
#if defined( KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND )
#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#endif
#else
#define KOKKOS_LAMBDA [=]__host__ __device__

@@ -141,16 +152,13 @@
#endif
#endif

#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )

#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
// Cuda version 8.0 still needs the functor wrapper
#if /* ( CUDA_VERSION < 8000 ) && */ defined( __NVCC__ )
#if defined( __NVCC__ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
#endif
#else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )

//----------------------------------------------------------------------------
// Language info: C++, CUDA, OPENMP
@@ -161,8 +169,20 @@
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#ifdef KOKKOS_COMPILER_CLANG
#define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
#endif
#endif // #if defined( __CUDA_ARCH__ )

#if defined( KOKKOS_ENABLE_ROCM ) && defined( __HCC__ )

#define KOKKOS_FORCEINLINE_FUNCTION __attribute__((amp,cpu)) inline
#define KOKKOS_INLINE_FUNCTION __attribute__((amp,cpu)) inline
#define KOKKOS_FUNCTION __attribute__((amp,cpu))
#define KOKKOS_LAMBDA [=] __attribute__((amp,cpu))
#define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
#endif

#if defined( _OPENMP )
// Compiling with OpenMP.
// The value of _OPENMP is an integer value YYYYMM
@@ -179,15 +199,6 @@
// Host code is compiled again with another compiler.
// Device code is compile to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
#else
#if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
// CUDA (including version 6.5) does not support giving lambdas as
// arguments to global functions. Thus its not currently possible
// to dispatch lambdas from the host.
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif
#endif // #if defined( __NVCC__ )

#if !defined( KOKKOS_LAMBDA )
@@ -321,6 +332,10 @@
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1

#if ! defined( KOKKOS_ENABLE_ASM )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif

//----------------------------------------------------------------------------
@@ -397,6 +412,10 @@
#define KOKKOS_FUNCTION /**/
#endif

#if !defined( KOKKOS_FUNCTION_DEFAULTED )
#define KOKKOS_FUNCTION_DEFAULTED /**/
#endif

//----------------------------------------------------------------------------
// Define empty macro for restrict if necessary:

@@ -424,6 +443,7 @@
// There is zero or one default execution space specified.

#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
@@ -435,6 +455,7 @@
// If default is not specified then chose from enabled execution spaces.
// Priority: CUDA, OPENMP, THREADS, QTHREADS, SERIAL
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -442,6 +463,8 @@
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined( KOKKOS_ENABLE_ROCM )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM
#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
#elif defined( KOKKOS_ENABLE_OPENMP )
@@ -459,6 +482,8 @@

#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#elif defined( __HCC__ ) && defined( __HCC_ACCELERATOR__ ) && defined( KOKKOS_ENABLE_ROCM )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif

@@ -233,12 +233,24 @@ public:

//--------------------------------------------------------------------------

MemoryPool() = default ;
MemoryPool( MemoryPool && ) = default ;
MemoryPool( const MemoryPool & ) = default ;
MemoryPool & operator = ( MemoryPool && ) = default ;
MemoryPool & operator = ( const MemoryPool & ) = default ;

MemoryPool()
: m_tracker()
, m_sb_state_array(0)
, m_sb_state_size(0)
, m_sb_size_lg2(0)
, m_max_block_size_lg2(0)
, m_min_block_size_lg2(0)
, m_sb_count(0)
, m_hint_offset(0)
, m_data_offset(0)
, m_unused_padding(0)
{}

/**\brief Allocate a memory pool from 'memspace'.
*
* The memory pool will have at least 'min_total_alloc_size' bytes

@@ -1016,7 +1016,7 @@ parallel_reduce( std::string const & arg_label

//------------------------------

#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce(arg_label, 0, &kpID);
@@ -1042,7 +1042,7 @@ parallel_reduce( std::string const & arg_label

//------------------------------

#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}

lib/kokkos/core/src/Kokkos_ROCm.hpp (new file, 220 lines)
@@ -0,0 +1,220 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#ifndef KOKKOS_ROCM_HPP
#define KOKKOS_ROCM_HPP

#include <Kokkos_Core_fwd.hpp>

#if defined( KOKKOS_ENABLE_ROCM )
#include <ROCm/hc_math_std.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ROCmSpace.hpp>
#include <ROCm/Kokkos_ROCm_Exec.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>

/*--------------------------------------------------------------------------*/

#include <hc.hpp>
#include <hc_am.hpp>
#include <amp_math.h>

#if defined( __HCC_ACCELERATOR__ )

using namespace ::Concurrency::precise_math ;

#endif

/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {
class ROCmExec ;
} // namespace Impl
} // namespace Kokkos

/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Experimental {
/// \class ROCm
/// \brief Kokkos device for multicore processors in the host memory space.
class ROCm {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{

//! Tag this class as a kokkos execution space
typedef ROCm execution_space ;
typedef ROCmSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;

typedef LayoutLeft array_layout ;
typedef HostSpace::size_type size_type ;

typedef ScratchMemorySpace< ROCm > scratch_memory_space ;

~ROCm() {}
ROCm();
// explicit ROCm( const int instance_id );

ROCm( ROCm && ) = default ;
ROCm( const ROCm & ) = default ;
ROCm & operator = ( ROCm && ) = default ;
ROCm & operator = ( const ROCm & ) = default ;

//@}
//------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{

KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined( __HCC_ACCELERATOR__ )
return true;
#else
return false;
#endif
}

/** \brief Set the device in a "sleep" state. */
static bool sleep() ;

/** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
static bool wake() ;

/** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
static void fence() ;

/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );

/// \brief Free any resources being consumed by the device.
static void finalize() ;

/** \brief Initialize the device.
*
*/
struct SelectDevice {
int rocm_device_id ;
SelectDevice() : rocm_device_id(1) {}
explicit SelectDevice( int id ) : rocm_device_id( id+1 ) {}
};

int rocm_device() const { return m_device ; }
bool isAPU();
bool isAPU(int device);

static void initialize( const SelectDevice = SelectDevice());

static int is_initialized();

// static size_type device_arch();

// static size_type detect_device_count();

static int concurrency() ;
static const char* name();
private:
int m_device ;

};
}
} // namespace Kokkos

namespace Kokkos {
namespace Impl {

template<>
struct MemorySpaceAccess
< Kokkos::Experimental::ROCmSpace
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = false };
};

template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Experimental::ROCm::memory_space
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};

template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::HostSpace
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
};
} // namespace Experimental
} // namespace Kokkos

#include <ROCm/Kokkos_ROCm_Parallel.hpp>
#include <ROCm/Kokkos_ROCm_Task.hpp>

#endif
#endif

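For orientation, the new execution space plugs into the usual Kokkos idioms. A minimal sketch, assuming a ROCm-enabled build (view name, size, and kernel are illustrative only):

  Kokkos::View<double*, Kokkos::Experimental::ROCmSpace> a("a", n);
  // Run the kernel explicitly on the experimental ROCm execution space.
  Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Experimental::ROCm>(0, n),
                       KOKKOS_LAMBDA(const int i) { a(i) = 2.0 * i; });
  Kokkos::Experimental::ROCm::fence();
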
lib/kokkos/core/src/Kokkos_ROCmSpace.hpp (new file, 622 lines)
@@ -0,0 +1,622 @@
/* (Same Kokkos v. 2.0 Sandia //@HEADER license block as at the top of Kokkos_ROCm.hpp above.) */

#ifndef KOKKOS_ROCMSPACE_HPP
#define KOKKOS_ROCMSPACE_HPP

#include <Kokkos_Core_fwd.hpp>

#if defined( KOKKOS_ENABLE_ROCM )

#include <iosfwd>
#include <typeinfo>
#include <string>

#include <Kokkos_HostSpace.hpp>

/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Experimental {
/** \brief ROCm on-device memory management */

class ROCmSpace {
public:

//! Tag this class as a kokkos memory space
typedef ROCmSpace memory_space ;
typedef Kokkos::Experimental::ROCm execution_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;

typedef unsigned int size_type ;

/*--------------------------------*/

ROCmSpace();
ROCmSpace( ROCmSpace && rhs ) = default ;
ROCmSpace( const ROCmSpace & rhs ) = default ;
ROCmSpace & operator = ( ROCmSpace && rhs ) = default ;
ROCmSpace & operator = ( const ROCmSpace & rhs ) = default ;
~ROCmSpace() = default ;

/**\brief Allocate untracked memory in the rocm space */
void * allocate( const size_t arg_alloc_size ) const ;

/**\brief Deallocate untracked memory in the rocm space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;

/**\brief Return Name of the MemorySpace */
static constexpr const char* name() { return m_name; };

/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access ROCmSpace */
static void access_error();
static void access_error( const void * const );

private:

int m_device ; ///< Which ROCm device

static constexpr const char* m_name = "ROCm";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
};

} // namespace Experimental

namespace Impl {

void * rocm_device_allocate(int);
void * rocm_hostpinned_allocate(int);
void rocm_device_free(void * );

/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_arrays_rocm_space();

/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* atomic_lock_array_rocm_space_ptr(bool deallocate = false);

/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
///
/// Team and Thread private scratch allocations in
/// global memory are aquired via locks.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* scratch_lock_array_rocm_space_ptr(bool deallocate = false);

/// \brief Retrieve the pointer to the scratch array for unique identifiers.
///
/// Unique identifiers in the range 0-ROCm::concurrency
/// are provided via locks.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* threadid_lock_array_rocm_space_ptr(bool deallocate = false);
}
} // namespace Kokkos

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Experimental {
/** \brief Host memory that is accessible to ROCm execution space
* through ROCm's host-pinned memory allocation.
*/
class ROCmHostPinnedSpace {
public:

//! Tag this class as a kokkos memory space
/** \brief Memory is in HostSpace so use the HostSpace::execution_space */
typedef HostSpace::execution_space execution_space ;
typedef ROCmHostPinnedSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;

/*--------------------------------*/

ROCmHostPinnedSpace();
ROCmHostPinnedSpace( ROCmHostPinnedSpace && rhs ) = default ;
ROCmHostPinnedSpace( const ROCmHostPinnedSpace & rhs ) = default ;
ROCmHostPinnedSpace & operator = ( ROCmHostPinnedSpace && rhs ) = default ;
ROCmHostPinnedSpace & operator = ( const ROCmHostPinnedSpace & rhs ) = default ;
~ROCmHostPinnedSpace() = default ;

/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;

/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;

/**\brief Return Name of the MemorySpace */
static constexpr const char* name() { return m_name; };

private:

static constexpr const char* m_name = "ROCmHostPinned";

/*--------------------------------*/
};
} // namespace Experimental
} // namespace Kokkos

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {

static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace >::assignable , "" );

//----------------------------------------

template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace > {
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};

template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
// HostSpace::execution_space == ROCmHostPinnedSpace::execution_space
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};

//----------------------------------------

template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace > {
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};

template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
// ROCmSpace::execution_space != ROCmHostPinnedSpace::execution_space
enum { assignable = false };
enum { accessible = true }; // ROCmSpace::execution_space
enum { deepcopy = true };
};

//----------------------------------------
// ROCmHostPinnedSpace::execution_space == HostSpace::execution_space
// ROCmHostPinnedSpace accessible to both ROCm and Host

template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::HostSpace > {
enum { assignable = false }; // Cannot access from ROCm
enum { accessible = true }; // ROCmHostPinnedSpace::execution_space
enum { deepcopy = true };
};

template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace > {
enum { assignable = false }; // Cannot access from Host
enum { accessible = false };
enum { deepcopy = true };
};

};
//----------------------------------------

} // namespace Kokkos::Impl

/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {

hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n);

template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm>
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};

template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};

template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};

template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopy (dst,src,n);
}
};

template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};

template<class ExecutionSpace>
struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};

template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm>
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};

template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};

template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};

template<class ExecutionSpace>
struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace>
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopyROCm (dst,src,n);
}
};

template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopyROCm (dst,src,n);
}
};

template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
// hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
// fut.wait();
// DeepCopyAsyncROCm (dst,src,n);
DeepCopy (dst,src,n);
}
};

template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};

template<class ExecutionSpace>
struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }

inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

/** Running in ROCmSpace attempting to access HostSpace: error */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("ROCm code attempted to access HostSpace memory"); }

KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
};

/** Running in ROCmSpace accessing ROCmHostPinnedSpace: ok */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};

/** Running in ROCmSpace attempting to access an unknown space: error */
template< class OtherSpace >
struct VerifyExecutionCanAccessMemorySpace<
typename enable_if< ! is_same<Kokkos::Experimental::ROCmSpace,OtherSpace>::value , Kokkos::Experimental::ROCmSpace >::type ,
OtherSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("ROCm code attempted to access unknown Space memory"); }

KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
};

//----------------------------------------------------------------------------
/** Running in HostSpace attempting to access ROCmSpace */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >
{
enum { value = false };
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};

/** Running in HostSpace accessing ROCmHostPinnedSpace is OK */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) {}
KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

template<>
class SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >
: public SharedAllocationRecord< void , void >
{
private:

typedef SharedAllocationRecord< void , void > RecordBase ;

SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;

static void deallocate( RecordBase * );

static RecordBase s_root_record ;

const Kokkos::Experimental::ROCmSpace m_space ;

protected:

~SharedAllocationRecord();

SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);

public:

std::string get_label() const ;

static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );

/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );

/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );

/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );

static SharedAllocationRecord * get_record( void * arg_alloc_ptr );

static void print_records( std::ostream & , const Kokkos::Experimental::ROCmSpace & , bool detail = false );
};

template<>
class SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >
: public SharedAllocationRecord< void , void >
{
private:

typedef SharedAllocationRecord< void , void > RecordBase ;

SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;

static void deallocate( RecordBase * );

static RecordBase s_root_record ;

const Kokkos::Experimental::ROCmHostPinnedSpace m_space ;

protected:

~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_space() {}

SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);

public:

std::string get_label() const ;

static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );

/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );

/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );

static SharedAllocationRecord * get_record( void * arg_alloc_ptr );

static void print_records( std::ostream & , const Kokkos::Experimental::ROCmHostPinnedSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */
#endif /* #define KOKKOS_ROCMSPACE_HPP */

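The DeepCopy specializations declared above are what ultimately back Kokkos::deep_copy between host and ROCm device memory. A minimal sketch, assuming a ROCm-enabled build (names illustrative):

  Kokkos::View<double*, Kokkos::Experimental::ROCmSpace> dev("dev", n);
  auto host = Kokkos::create_mirror_view(dev);   // host-accessible mirror
  // ... fill 'host' ...
  Kokkos::deep_copy(dev, host);                  // HostSpace -> ROCmSpace
  Kokkos::deep_copy(host, dev);                  // ROCmSpace -> HostSpace
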
@@ -681,6 +681,67 @@ public:

return f ;
}

template < class F >
KOKKOS_FUNCTION
Future< execution_space >
when_all( int narg , F const func )
{
using input_type = decltype( func(0) );
using future_type = Future< execution_space > ;
using task_base = Kokkos::Impl::TaskBase< void , void , void > ;

static_assert( is_future< input_type >::value
, "Functor must return a Kokkos::Future" );

future_type f ;

if ( 0 == narg ) return f ;

size_t const alloc_size = m_queue->when_all_allocation_size( narg );

f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( alloc_size ) );

if ( f.m_task ) {

// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future

new( f.m_task ) task_base();

f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = alloc_size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;

// Assign dependences, reference counts were already incremented

task_base * volatile * const dep =
f.m_task->aggregate_dependences();

for ( int i = 0 ; i < narg ; ++i ) {
const input_type arg_f = func(i);
if ( 0 != arg_f.m_task ) {

if ( m_queue != static_cast< queue_type * >( arg_f.m_task->m_queue ) ) {
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
}
// Increment reference count to track subsequent assignment.
Kokkos::atomic_increment( &(arg_f.m_task->m_ref_count) );
dep[i] = arg_f.m_task ;
}
}

Kokkos::memory_fence();

m_queue->schedule_aggregate( f.m_task );
// this when_all may be processed at any moment
}
return f ;
}

//----------------------------------------

KOKKOS_INLINE_FUNCTION

@ -2429,6 +2429,7 @@ template < class ValueType >
struct CommonViewAllocProp< void, ValueType >
{
  using value_type        = ValueType;
  using scalar_array_type = ValueType;

  template < class ... Views >
  CommonViewAllocProp( const Views & ... ) {}

439  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp  (new file)
@ -0,0 +1,439 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <hc.hpp>
|
||||
//#include <hsa_atomic.h>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_ROCM_ATOMICS
|
||||
namespace Kokkos {
|
||||
//ROCm can do:
|
||||
//Types int/unsigned int
|
||||
//variants: atomic_exchange/compare_exchange/fetch_add/fetch_sub/fetch_max/fetch_min/fetch_and/fetch_or/fetch_xor/fetch_inc/fetch_dec
|
||||
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int atomic_exchange(int* dest, const int& val) {
|
||||
return hc::atomic_exchange_int(dest, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
unsigned int atomic_exchange(unsigned int* dest, const unsigned int& val) {
|
||||
return hc::atomic_exchange_unsigned(dest, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t atomic_exchange(int64_t* dest, const int64_t& val) {
|
||||
return (int64_t)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t atomic_exchange(uint64_t* dest, const uint64_t& val) {
|
||||
return hc::atomic_exchange_uint64(dest, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
long long atomic_exchange(long long* dest, const long long& val) {
|
||||
return (long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
unsigned long long atomic_exchange(unsigned long long* dest, const unsigned long long& val) {
|
||||
return (unsigned long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float atomic_exchange(float* dest, const float& val) {
|
||||
union U {
|
||||
int i ;
|
||||
float f ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} idest,ival;
|
||||
idest.f = *dest;
|
||||
ival.f = val;
|
||||
idest.i = hc::atomic_exchange_int((int*)dest, ival.i);
|
||||
return idest.f;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double atomic_exchange(double* dest, const double& val) {
|
||||
union U {
|
||||
uint64_t i ;
|
||||
double d ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} idest,ival;
|
||||
idest.d = *dest;
|
||||
ival.d = val;
|
||||
idest.i = hc::atomic_exchange_uint64((uint64_t*)dest, ival.i);
|
||||
return idest.d;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int atomic_compare_exchange(int* dest, int compare, const int& val);
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val);
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
|
||||
union U {
|
||||
int i ;
|
||||
T t ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
assume.i = oldval.i ;
|
||||
newval.t = val ;
|
||||
atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
|
||||
union U {
|
||||
uint64_t i ;
|
||||
T t ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
assume.i = oldval.i ;
|
||||
newval.t = val ;
|
||||
atomic_compare_exchange( (int64_t*)(dest) , assume.i, newval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
|
||||
return val;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int atomic_compare_exchange(int* dest, int compare, const int& val) {
|
||||
return hc::atomic_compare_exchange_int(dest, compare, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
unsigned int atomic_compare_exchange(unsigned int* dest, unsigned int compare, const unsigned int& val) {
|
||||
return hc::atomic_compare_exchange_unsigned(dest, compare, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val) {
|
||||
return (int64_t) hc::atomic_compare_exchange_uint64((uint64_t*)dest, (uint64_t)compare, (const uint64_t&)val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint64_t atomic_compare_exchange(uint64_t* dest, uint64_t compare, const uint64_t& val) {
|
||||
return hc::atomic_compare_exchange_uint64(dest, compare, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
long long atomic_compare_exchange(long long* dest, long long compare, const long long& val) {
|
||||
return (long long)hc::atomic_compare_exchange_uint64((uint64_t*)(dest), (uint64_t)(compare), (const uint64_t&)(val));
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
float atomic_compare_exchange(float* dest, float compare, const float& val) {
|
||||
union U {
|
||||
int i ;
|
||||
float f ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} idest,icompare,ival;
|
||||
idest.f = *dest;
|
||||
icompare.f = compare;
|
||||
ival.f = val;
|
||||
idest.i = hc::atomic_compare_exchange_int(reinterpret_cast<int*>(dest), icompare.i, ival.i);
|
||||
return idest.f;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
double atomic_compare_exchange(double* dest, double compare, const double& val) {
|
||||
union U {
|
||||
uint64_t i ;
|
||||
double d ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} idest,icompare,ival;
|
||||
idest.d = *dest;
|
||||
icompare.d = compare;
|
||||
ival.d = val;
|
||||
idest.i = hc::atomic_compare_exchange_uint64(reinterpret_cast<uint64_t*>(dest), icompare.i, ival.i);
|
||||
return idest.d;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
|
||||
union U {
|
||||
int i ;
|
||||
T f ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} idest,icompare,ival;
|
||||
idest.f = *dest;
|
||||
icompare.f = compare;
|
||||
ival.f = val;
|
||||
idest.i = hc::atomic_compare_exchange_int((int*)(dest), icompare.i, ival.i);
|
||||
return idest.f;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int64_t), const T&>::type val) {
|
||||
union U {
|
||||
uint64_t i ;
|
||||
T f ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} idest,icompare,ival;
|
||||
idest.f = *dest;
|
||||
icompare.f = compare;
|
||||
ival.f = val;
|
||||
idest.i = hc::atomic_compare_exchange_uint64((uint64_t*)(dest), icompare.i, ival.i);
|
||||
return idest.f;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<(sizeof(T) != sizeof(int32_t)) && (sizeof(T) != sizeof(int64_t)), const T&>::type val) {
|
||||
return val;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int atomic_fetch_add (volatile int * dest, const int& val) {
|
||||
return hc::atomic_fetch_add((int *)dest, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
unsigned int atomic_fetch_add(unsigned int* dest, const unsigned int& val) {
|
||||
return hc::atomic_fetch_add(dest, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
unsigned long atomic_fetch_add(volatile unsigned long* dest, const unsigned long& val) {
|
||||
return (unsigned long)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t)val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t atomic_fetch_add(volatile int64_t* dest, const int64_t& val) {
|
||||
return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t&)val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
char atomic_fetch_add(volatile char * dest, const char& val) {
  unsigned int oldval, newval, assume;
  oldval = *(int *) dest;

  do {
    assume = oldval;
    // Update only the low byte; parentheses make the intended grouping
    // explicit ('+' binds tighter than '&').
    newval = ( assume & 0x7fffff00 ) + ( ( ( assume & 0xff ) + val ) & 0xff );
    oldval = hc::atomic_compare_exchange_unsigned( (unsigned int*) dest, assume, newval );
  } while ( assume != oldval );

  return oldval;
}
|
||||
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
short atomic_fetch_add(volatile short * dest, const short& val) {
  unsigned int oldval, newval, assume;
  oldval = *(int *) dest;

  do {
    assume = oldval;
    // Update only the low halfword; parentheses make the intended grouping explicit.
    newval = ( assume & 0x7fff0000 ) + ( ( ( assume & 0xffff ) + val ) & 0xffff );
    oldval = hc::atomic_compare_exchange_unsigned( (unsigned int*) dest, assume, newval );
  } while ( assume != oldval );

  return oldval;
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
long long atomic_fetch_add(volatile long long * dest, const long long& val) {
|
||||
return (long long)hc::atomic_fetch_add((uint64_t*)dest, (const uint64_t&)val);
|
||||
}
|
||||
|
||||
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int atomic_fetch_sub (volatile int * dest, const int& val) {
|
||||
return hc::atomic_fetch_sub((int *)dest, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
unsigned int atomic_fetch_sub(volatile unsigned int* dest, const unsigned int& val) {
|
||||
return hc::atomic_fetch_sub((unsigned int *)dest, val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int64_t atomic_fetch_sub(int64_t* dest, const int64_t& val) {
|
||||
return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, -(const uint64_t&)val);
|
||||
// return (int64_t)hc::atomic_fetch_sub_uint64((uint64_t*)dest, (const uint64_t&)val);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
char atomic_fetch_sub(volatile char * dest, const char& val) {
  unsigned int oldval, newval, assume;
  oldval = *(int *) dest;

  do {
    assume = oldval;
    // Update only the low byte; parentheses make the intended grouping explicit.
    newval = ( assume & 0x7fffff00 ) + ( ( ( assume & 0xff ) - val ) & 0xff );
    oldval = hc::atomic_compare_exchange_unsigned( (unsigned int*) dest, assume, newval );
  } while ( assume != oldval );

  return oldval;
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
short atomic_fetch_sub(volatile short * dest, const short& val) {
  unsigned int oldval, newval, assume;
  oldval = *(int *) dest;

  do {
    assume = oldval;
    // Update only the low halfword; parentheses make the intended grouping explicit.
    newval = ( assume & 0x7fff0000 ) + ( ( ( assume & 0xffff ) - val ) & 0xffff );
    oldval = hc::atomic_compare_exchange_unsigned( (unsigned int*) dest, assume, newval );
  } while ( assume != oldval );

  return oldval;
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
long long atomic_fetch_sub(volatile long long * dest, const long long& val) {
|
||||
return (long long)hc::atomic_fetch_add((uint64_t*)dest, -(const uint64_t&)val);
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
|
||||
union U {
|
||||
unsigned int i ;
|
||||
T t ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
assume.i = oldval.i ;
|
||||
newval.t = assume.t + val ;
|
||||
oldval.i = atomic_compare_exchange( (unsigned int*)(dest) , assume.i , newval.i );
|
||||
} while ( assume.i != oldval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
|
||||
union U {
|
||||
uint64_t i ;
|
||||
T t ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
assume.i = oldval.i ;
|
||||
newval.t = assume.t + val ;
|
||||
oldval.i = atomic_compare_exchange( (uint64_t*)dest , assume.i , newval.i );
|
||||
} while ( assume.i != oldval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
|
||||
|
||||
//WORKAROUND
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
|
||||
return val ;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int),T>::type & val) {
|
||||
union U {
|
||||
int i ;
|
||||
T t ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
assume.i = oldval.i ;
|
||||
newval.t = assume.t - val ;
|
||||
oldval.i = Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
|
||||
} while ( assume.i != oldval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
|
||||
union U {
|
||||
int64_t i ;
|
||||
T t ;
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
assume.i = oldval.i ;
|
||||
newval.t = assume.t - val ;
|
||||
oldval.i = atomic_compare_exchange( (int64_t*)dest , assume.i , newval.i );
|
||||
} while ( assume.i != oldval.i );
|
||||
|
||||
return oldval.t ;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
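// Usage sketch for the overloads above (illustrative only; assumes it is
// compiled as ROCm device code with these atomics enabled). Each call maps
// onto one of the hc:: intrinsics or CAS loops defined in this header.
KOKKOS_INLINE_FUNCTION
void example_rocm_atomics( int * counter , double * accumulator )
{
  (void) Kokkos::atomic_fetch_add( counter , 1 );              // hc::atomic_fetch_add
  (void) Kokkos::atomic_exchange( counter , 0 );               // hc::atomic_exchange_int
  (void) Kokkos::atomic_compare_exchange( counter , 0 , 1 );   // hc::atomic_compare_exchange_int
  (void) Kokkos::atomic_fetch_add( accumulator , 2.5 );        // 64-bit CAS loop on the double payload
}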
51  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Config.hpp  (new file)
@ -0,0 +1,51 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
|
||||
#define GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
|
||||
|
||||
#ifndef KOKKOS_ROCM_HAS_WORKAROUNDS
|
||||
#define KOKKOS_ROCM_HAS_WORKAROUNDS 1
|
||||
#endif
|
||||
|
||||
#endif
|
||||
133  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp  (new file)
@ -0,0 +1,133 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
#ifndef KOKKOS_ROCMEXEC_HPP
|
||||
#define KOKKOS_ROCMEXEC_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include <typeinfo>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
//#include <ROCm/Kokkos_ROCmExec.hpp>
|
||||
#include <hc.hpp>
|
||||
|
||||
#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF
|
||||
#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39
|
||||
#define ROCM_CONCURRENCY 20480
|
||||
//#define ROCM_CONCURRENCY 81920 # for fiji
|
||||
|
||||
namespace Kokkos {
|
||||
static int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
|
||||
static int rocm_space_scratch_locks[ROCM_CONCURRENCY];
|
||||
static int rocm_space_threadid_locks[ROCM_CONCURRENCY];
|
||||
namespace Impl {
|
||||
// TODO: mimic the CUDA implementation, add dGPU capability
|
||||
|
||||
void init_rocm_atomic_lock_array() {
|
||||
static int is_initialized = 0;
|
||||
if(!is_initialized)
|
||||
{
|
||||
for(int i = 0; i < ROCM_SPACE_ATOMIC_MASK+1; i++)
|
||||
rocm_space_atomic_locks[i] = 0;
|
||||
is_initialized = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void init_rocm_scratch_lock_array() {
|
||||
static int is_initialized = 0;
|
||||
if(!is_initialized)
|
||||
{
|
||||
for(int i = 0; i < ROCM_CONCURRENCY; i++)
|
||||
rocm_space_scratch_locks[i] = 0;
|
||||
is_initialized = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void init_rocm_threadid_lock_array() {
|
||||
static int is_initialized = 0;
|
||||
if(!is_initialized)
|
||||
{
|
||||
for(int i = 0; i < ROCM_CONCURRENCY; i++)
|
||||
rocm_space_threadid_locks[i] = 0;
|
||||
is_initialized = 1;
|
||||
}
|
||||
}
|
||||
|
||||
void init_lock_arrays_rocm_space() {
|
||||
init_rocm_atomic_lock_array();
|
||||
// init_rocm_scratch_lock_array();
|
||||
// init_rocm_threadid_lock_array();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
#if 0
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool lock_address_rocm_space(void* ptr) {
|
||||
#if 0
|
||||
return(Kokkos::Impl::lock_address_host_space(ptr));
|
||||
#else
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & ROCM_SPACE_ATOMIC_MASK;
|
||||
return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void unlock_address_rocm_space(void* ptr) {
|
||||
#if 0
|
||||
Kokkos::Impl::unlock_address_host_space(ptr) ;
|
||||
#else
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & ROCM_SPACE_ATOMIC_MASK;
|
||||
hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace Kokkos
|
||||
#endif
|
||||
|
||||
#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */
|
||||
137  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp  (new file)
@ -0,0 +1,137 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
#ifndef KOKKOS_ROCMEXEC_HPP
|
||||
#define KOKKOS_ROCMEXEC_HPP
|
||||
|
||||
#include <algorithm>
|
||||
#include <typeinfo>
|
||||
|
||||
#if defined(__HCC_ACCELERATOR__)
|
||||
#define printf(...)
|
||||
#endif
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
struct ROCmTraits {
|
||||
// TODO: determine if needed
|
||||
enum { WavefrontSize = 64 /* 64 */ };
|
||||
enum { WorkgroupSize = 64 /* 64 */ };
|
||||
enum { WavefrontIndexMask = 0x001f /* Mask for warpindex */ };
|
||||
enum { WavefrontIndexShift = 5 /* WarpSize == 1 << WarpShift */ };
|
||||
|
||||
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
|
||||
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
|
||||
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
|
||||
|
||||
enum { UpperBoundExtentCount = 65535 /* Hard upper bound */ };
|
||||
#if 0
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
ROCmSpace::size_type wavefront_count( ROCmSpace::size_type i )
|
||||
{ return ( i + WavefrontIndexMask ) >> WavefrontIndexShift ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
ROCmSpace::size_type wavefront_align( ROCmSpace::size_type i )
|
||||
{
|
||||
enum { Mask = ~ROCmSpace::size_type( WavefrontIndexMask ) };
|
||||
return ( i + WavefrontIndexMask ) & Mask ;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
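// Arithmetic sketch (illustrative helpers, not part of the header): the
// mask/shift pair above encodes round-up-to-wavefront arithmetic like the
// helpers in the #if 0 block. With WavefrontIndexShift = 5 (32 entries per
// mask unit), an extent of 70 gives (70 + 31) >> 5 = 3 wavefront units and
// (70 + 31) & ~31 = 96 aligned entries.
inline unsigned example_wavefront_count( unsigned i )
{ return ( i + 0x001f ) >> 5 ; }

inline unsigned example_wavefront_align( unsigned i )
{ return ( i + 0x001f ) & ~unsigned(0x001f) ; }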
size_t rocm_internal_cu_count();
|
||||
size_t rocm_internal_maximum_workgroup_count();
|
||||
|
||||
size_t * rocm_internal_scratch_flags( const size_t size );
|
||||
size_t * rocm_internal_scratch_space( const size_t size );
|
||||
|
||||
}
|
||||
} // namespace Kokkos
|
||||
#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF
|
||||
#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39
|
||||
//int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
|
||||
extern int
|
||||
*rocm_space_atomic_locks;
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
void init_lock_arrays_rocm_space();
|
||||
|
||||
void* rocm_resize_scratch_space(size_t bytes, bool force_shrink = false);
|
||||
|
||||
// TODO: determine if needed
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool lock_address_rocm_space(void* ptr) {
|
||||
#if 0
|
||||
return(Kokkos::Impl::lock_address_host_space(ptr));
|
||||
#else
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & ROCM_SPACE_ATOMIC_MASK;
|
||||
return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
|
||||
#endif
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void unlock_address_rocm_space(void* ptr) {
|
||||
#if 0
|
||||
Kokkos::Impl::unlock_address_host_space(ptr) ;
|
||||
#else
|
||||
size_t offset = size_t(ptr);
|
||||
offset = offset >> 2;
|
||||
offset = offset & ROCM_SPACE_ATOMIC_MASK;
|
||||
hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace Kokkos
|
||||
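// Sketch of how the lock array above could back an arbitrarily sized atomic
// update, mirroring the spin-on-lock pattern of the CUDA backend. The helper
// name and the functor parameter are hypothetical; this is not part of the
// header itself.
template< class T , class Op >
KOKKOS_INLINE_FUNCTION
void example_locked_atomic_update( volatile T * const dest , Op op )
{
  // Spin until the lock slot hashed from this address is acquired.
  while ( ! Kokkos::Impl::lock_address_rocm_space( (void *) dest ) ) ;

  // Protected read-modify-write of a value of any size.
  T tmp = *const_cast< T * >( dest );
  *const_cast< T * >( dest ) = op( tmp );

  Kokkos::Impl::unlock_address_rocm_space( (void *) dest );
}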
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
//extern
|
||||
//KOKKOS_INLINE_FUNCTION
|
||||
//void init_lock_arrays_rocm_space();
|
||||
|
||||
|
||||
}
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */
|
||||
753  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp  (new file)
@ -0,0 +1,753 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Kokkos interfaces */
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
/* only compile this file if ROCM is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_ROCM
|
||||
|
||||
//#include <ROCm/Kokkos_ROCm_Internal.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Kokkos_ROCmSpace.hpp>
|
||||
#include <ROCm/Kokkos_ROCm_Exec.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Standard 'C' libraries */
|
||||
#include <stdlib.h>
|
||||
|
||||
/* Standard 'C++' libraries */
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
|
||||
|
||||
//KOKKOS_INLINE_FUNCTION
|
||||
// Kokkos::Impl::ROCmLockArraysStruct kokkos_impl_rocm_lock_arrays ;
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
#if 0
|
||||
namespace {
|
||||
__global__
|
||||
void query_rocm_kernel_arch( int * d_arch )
|
||||
{
|
||||
#if defined( __HCC_ACCELERATOR__ )
|
||||
*d_arch = OCM_ARCH__ ;
|
||||
#else
|
||||
*d_arch = 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
/** Query what compute capability is actually launched to the device: */
|
||||
int rocm_kernel_arch()
|
||||
{
|
||||
int * d_arch = 0 ;
|
||||
rocmMalloc( (void **) & d_arch , sizeof(int) );
|
||||
query_rocm_kernel_arch<<<1,1>>>( d_arch );
|
||||
int arch = 0 ;
|
||||
rocmMemcpy( & arch , d_arch , sizeof(int) , rocmMemcpyDefault );
|
||||
rocmFree( d_arch );
|
||||
return arch ;
|
||||
}
|
||||
bool rocm_launch_blocking()
|
||||
{
|
||||
const char * env = getenv("ROCM_LAUNCH_BLOCKING");
|
||||
|
||||
if (env == 0) return false;
|
||||
|
||||
return atoi(env);
|
||||
}
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
// true device memory allocation, not visible from host
|
||||
void * rocm_device_allocate(int size)
|
||||
{
|
||||
void * ptr;
|
||||
hc::accelerator acc;
|
||||
ptr = hc::am_alloc(size,acc,0);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
// host pinned allocation
|
||||
// flag = 1, non-coherent, host resident, but with gpu address space pointer
|
||||
// flag = 2, coherent, host resident, but with host address space pointer
|
||||
void * rocm_hostpinned_allocate(int size)
|
||||
{
|
||||
void * ptr;
|
||||
hc::accelerator acc;
|
||||
ptr = hc::am_alloc(size,acc,2);
|
||||
return ptr;
|
||||
}
|
||||
// same free used by all rocm memory allocations
|
||||
void rocm_device_free(void * ptr)
|
||||
{
|
||||
hc::am_free(ptr);
|
||||
}
|
||||
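// Illustrative host-side round trip through the helpers above (the function
// name is hypothetical): rocm_hostpinned_allocate uses hc::am_alloc with
// flag 2 (coherent, host resident), and the same rocm_device_free /
// hc::am_free path releases every ROCm allocation.
void example_pinned_roundtrip( int bytes )
{
  void * buf = Kokkos::Impl::rocm_hostpinned_allocate( bytes );

  // ... fill the buffer on the host, read it from device code ...

  Kokkos::Impl::rocm_device_free( buf );
}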
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void rocm_device_synchronize()
|
||||
{
|
||||
hc::accelerator_view av = hc::accelerator().get_default_view();
|
||||
hc::completion_future fut = av.create_marker();
|
||||
fut.wait();
|
||||
}
|
||||
|
||||
void rocm_internal_error_throw( const char * name, const char * file, const int line )
|
||||
{
|
||||
#if 0
|
||||
std::ostringstream out ;
|
||||
out << name << " error( " << rocmGetErrorName(e) << "): " << rocmGetErrorString(e);
|
||||
if (file) {
|
||||
out << " " << file << ":" << line;
|
||||
}
|
||||
throw_runtime_exception( out.str() );
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Some significant rocm device properties:
|
||||
//
|
||||
// rocmDeviceProp::name : Text label for device
|
||||
// rocmDeviceProp::major : Device major number
|
||||
// rocmDeviceProp::minor : Device minor number
|
||||
// rocmDeviceProp::workgroupSize : number of threads per workgroup
|
||||
// rocmDeviceProp::multiProcessorCount : number of multiprocessors
|
||||
// rocmDeviceProp::sharedMemPerBlock : capacity of shared memory per wavefront
|
||||
// rocmDeviceProp::totalConstMem : capacity of constant memory
|
||||
// rocmDeviceProp::totalGlobalMem : capacity of global memory
|
||||
// rocmDeviceProp::maxGridSize[3] : maximum grid size
|
||||
|
||||
//
|
||||
//
|
||||
// the data we have available from a ROCm accelerator
|
||||
// std::wstring get_device_path()
|
||||
// std::wstring get_description()
|
||||
// unsigned int get_version()
|
||||
// bool get_has_display()
|
||||
// size_t get_dedicated_memory()
|
||||
// bool get_supports_double_precision()
|
||||
// bool get_supports_limited_double_precision()
|
||||
// bool get_is_debug()
|
||||
// bool get_supports_cpu_shared_memory()
|
||||
// size_t get_max_tile_static_size()
|
||||
// unsigned int get_cu_count()
|
||||
// bool has_cpu_accessible_am()
|
||||
struct rocmDeviceProp {
|
||||
char name[256];
|
||||
char description[256];
|
||||
unsigned int version;
|
||||
int device_type;
|
||||
int device_ordinal;
|
||||
int major;
|
||||
int minor;
|
||||
size_t totalGlobalMem;
|
||||
size_t sharedMemPerWavefront;
|
||||
int WavefrontSize;
|
||||
int WorkgroupSize;
|
||||
int MaxTileCount;
|
||||
int maxThreadsPerWorkgroup;
|
||||
int multiProcessorCount;
|
||||
int canMapHostMemory;
|
||||
bool APU;
|
||||
};
|
||||
|
||||
|
||||
|
||||
void rocmGetDeviceProperties(struct rocmDeviceProp* devProp, int device)
|
||||
{
|
||||
std::wstring s;
|
||||
int i,n;
|
||||
hc::accelerator acc;
|
||||
std::vector<hc::accelerator> accv = acc.get_all() ;
|
||||
|
||||
hc::accelerator a = accv[device];
|
||||
|
||||
s=a.get_device_path();
|
||||
i = 0;
|
||||
for(wchar_t c: s)
|
||||
if((n=std::wctomb(&devProp->name[i],c))>0)
|
||||
i+=n;
|
||||
|
||||
/* assume a CPU */
|
||||
devProp->version = a.get_version();
|
||||
devProp->major = a.get_version()>>16; // for CPU, these are meaningless
|
||||
devProp->minor = a.get_version()&0xff;
|
||||
devProp->device_ordinal = 0;
|
||||
|
||||
/* is this an AMD graphics card */
|
||||
if((devProp->name[0]=='g') && (devProp->name[1]=='f')
|
||||
&& (devProp->name[2]=='x')) {
|
||||
/* for AMD cards, the name has the format gfxMmmO */
|
||||
|
||||
devProp->device_type = ((devProp->name[3]-0x30)<<16)
|
||||
+ ((devProp->name[4]-0x30)<<8)
|
||||
+ (devProp->name[5]-0x30);
|
||||
devProp->device_ordinal = devProp->name[6]-0x30;
|
||||
devProp->major = devProp->name[3]-0x30;
|
||||
devProp->minor = devProp->name[5]-0x30;
|
||||
}
|
||||
|
||||
s=a.get_description();
|
||||
i = 0;
|
||||
for(wchar_t c: s)
|
||||
if((n=std::wctomb(&devProp->description[i],c))>0)
|
||||
i+=n;
|
||||
devProp->totalGlobalMem = a.get_dedicated_memory();
|
||||
devProp->sharedMemPerWavefront = a.get_max_tile_static_size();
|
||||
devProp->WavefrontSize = 64;
|
||||
devProp->WorkgroupSize = 256; // preferred
|
||||
devProp->MaxTileCount = 409600; // as defined in /opt/rocm/hcc-lc/include/hsa_new.h
|
||||
devProp->maxThreadsPerWorkgroup = 1024;
|
||||
devProp->multiProcessorCount = a.get_cu_count();
|
||||
devProp->canMapHostMemory = a.get_supports_cpu_shared_memory();
|
||||
// Kaveri has 64KB L2 per CU, 16KB L1, 64KB Vector Regs/SIMD, or 128 regs/thread
|
||||
// GCN has 64KB LDS per CU
|
||||
|
||||
//Kaveri APU is 7:0:0
|
||||
//Carrizo APU is 8:0:1
|
||||
devProp->APU = (((devProp->major==7)&&(devProp->minor==0))|
|
||||
((devProp->major==8)&&(devProp->minor==1)))?true:false;
|
||||
}
|
||||
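// Worked example of the name decoding above (hypothetical helper, values are
// illustrative): for a device path reported as "gfx8010" the digits decode to
// major = 8, minor = 1 (the third digit), ordinal = 0, and
// device_type = (8<<16) + (0<<8) + 1 = 0x080001, which the APU test above
// classifies as a Carrizo-class part (8:0:1).
inline int example_decode_gfx_type( const char name[8] )
{
  return ( ( name[3] - 0x30 ) << 16 )
       + ( ( name[4] - 0x30 ) <<  8 )
       +   ( name[5] - 0x30 );
}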
|
||||
namespace {
|
||||
|
||||
|
||||
|
||||
class ROCmInternalDevices {
|
||||
public:
|
||||
enum { MAXIMUM_DEVICE_COUNT = 64 };
|
||||
struct rocmDeviceProp m_rocmProp[ MAXIMUM_DEVICE_COUNT ] ;
|
||||
int m_rocmDevCount ;
|
||||
|
||||
ROCmInternalDevices();
|
||||
|
||||
static const ROCmInternalDevices & singleton();
|
||||
};
|
||||
|
||||
ROCmInternalDevices::ROCmInternalDevices()
|
||||
{
|
||||
hc::accelerator acc;
|
||||
std::vector<hc::accelerator> accv = acc.get_all() ;
|
||||
m_rocmDevCount = accv.size();
|
||||
|
||||
if(m_rocmDevCount > MAXIMUM_DEVICE_COUNT) {
|
||||
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
|
||||
}
|
||||
for ( int i = 0 ; i < m_rocmDevCount ; ++i ) {
|
||||
rocmGetDeviceProperties( m_rocmProp + i , i );
|
||||
}
|
||||
}
|
||||
|
||||
const ROCmInternalDevices & ROCmInternalDevices::singleton()
|
||||
{
|
||||
static ROCmInternalDevices* self = nullptr;
|
||||
if (!self) {
|
||||
self = new ROCmInternalDevices();
|
||||
}
|
||||
return *self;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
class ROCmInternal {
|
||||
private:
|
||||
|
||||
ROCmInternal( const ROCmInternal & );
|
||||
ROCmInternal & operator = ( const ROCmInternal & );
|
||||
|
||||
|
||||
public:
|
||||
|
||||
typedef Kokkos::Experimental::ROCm::size_type size_type ;
|
||||
|
||||
int m_rocmDev ;
|
||||
int m_rocmArch ;
|
||||
unsigned m_multiProcCount ;
|
||||
unsigned m_maxWorkgroup ;
|
||||
unsigned m_maxSharedWords ;
|
||||
size_type m_scratchSpaceCount ;
|
||||
size_type m_scratchFlagsCount ;
|
||||
size_type * m_scratchSpace ;
|
||||
size_type * m_scratchFlags ;
|
||||
|
||||
static int was_finalized;
|
||||
|
||||
static ROCmInternal & singleton();
|
||||
|
||||
int verify_is_initialized( const char * const label ) const ;
|
||||
|
||||
int is_initialized() const
|
||||
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
|
||||
|
||||
void initialize( int rocm_device_id );
|
||||
void finalize();
|
||||
|
||||
void print_configuration( std::ostream & ) const ;
|
||||
|
||||
|
||||
~ROCmInternal();
|
||||
|
||||
ROCmInternal()
|
||||
: m_rocmDev( -1 )
|
||||
, m_rocmArch( -1 )
|
||||
, m_multiProcCount( 0 )
|
||||
, m_maxWorkgroup( 0 )
|
||||
, m_maxSharedWords( 0 )
|
||||
, m_scratchSpaceCount( 0 )
|
||||
, m_scratchFlagsCount( 0 )
|
||||
, m_scratchSpace( 0 )
|
||||
, m_scratchFlags( 0 )
|
||||
{}
|
||||
|
||||
size_type * scratch_space( const size_type size );
|
||||
size_type * scratch_flags( const size_type size );
|
||||
};
|
||||
|
||||
int ROCmInternal::was_finalized = 0;
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
void ROCmInternal::print_configuration( std::ostream & s ) const
|
||||
{
|
||||
const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
|
||||
|
||||
#if defined( KOKKOS_ENABLE_ROCM )
|
||||
s << "macro KOKKOS_ENABLE_ROCM : defined" << std::endl ;
|
||||
#endif
|
||||
#if defined( __hcc_version__ )
|
||||
s << "macro __hcc_version__ = " << __hcc_version__
|
||||
<< std::endl ;
|
||||
#endif
|
||||
|
||||
for ( int i = 0 ; i < dev_info.m_rocmDevCount ; ++i ) {
|
||||
s << "Kokkos::Experimental::ROCm[ " << i << " ] "
|
||||
<< dev_info.m_rocmProp[i].name
|
||||
<< " version " << (dev_info.m_rocmProp[i].major) << "." << dev_info.m_rocmProp[i].minor
|
||||
<< ", Total Global Memory: " << human_memory_size(dev_info.m_rocmProp[i].totalGlobalMem)
|
||||
<< ", Shared Memory per Wavefront: " << human_memory_size(dev_info.m_rocmProp[i].sharedMemPerWavefront);
|
||||
if ( m_rocmDev == i ) s << " : Selected" ;
|
||||
s << std::endl ;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
ROCmInternal::~ROCmInternal()
|
||||
{
|
||||
if ( m_scratchSpace ||
|
||||
m_scratchFlags ) {
|
||||
std::cerr << "Kokkos::Experimental::ROCm ERROR: Failed to call Kokkos::Experimental::ROCm::finalize()"
|
||||
<< std::endl ;
|
||||
std::cerr.flush();
|
||||
}
|
||||
|
||||
m_rocmDev = -1 ;
|
||||
m_rocmArch = -1 ;
|
||||
m_multiProcCount = 0 ;
|
||||
m_maxWorkgroup = 0 ;
|
||||
m_maxSharedWords = 0 ;
|
||||
m_scratchSpaceCount = 0 ;
|
||||
m_scratchFlagsCount = 0 ;
|
||||
m_scratchSpace = 0 ;
|
||||
m_scratchFlags = 0 ;
|
||||
}
|
||||
|
||||
int ROCmInternal::verify_is_initialized( const char * const label ) const
|
||||
{
|
||||
if ( m_rocmDev < 0 ) {
|
||||
std::cerr << "Kokkos::Experimental::ROCm::" << label << " : ERROR device not initialized" << std::endl ;
|
||||
}
|
||||
return 0 <= m_rocmDev ;
|
||||
}
|
||||
|
||||
ROCmInternal & ROCmInternal::singleton()
|
||||
{
|
||||
static ROCmInternal* self = nullptr ;
|
||||
if (!self) {
|
||||
self = new ROCmInternal();
|
||||
}
|
||||
return *self ;
|
||||
|
||||
}
|
||||
|
||||
void ROCmInternal::initialize( int rocm_device_id )
|
||||
{
|
||||
if ( was_finalized ) Kokkos::abort("Calling ROCm::initialize after ROCm::finalize is illegal\n");
|
||||
|
||||
if ( is_initialized() ) return;
|
||||
|
||||
enum { WordSize = sizeof(size_type) };
|
||||
|
||||
if ( ! HostSpace::execution_space::is_initialized() ) {
|
||||
const std::string msg("ROCm::initialize ERROR : HostSpace::execution_space is not initialized");
|
||||
throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
|
||||
|
||||
const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
|
||||
|
||||
const bool ok_id = 1 <= rocm_device_id &&
|
||||
rocm_device_id < dev_info.m_rocmDevCount ;
|
||||
|
||||
// Need at least a GPU device
|
||||
|
||||
const bool ok_dev = ok_id &&
|
||||
( 1 <= dev_info.m_rocmProp[ rocm_device_id ].major &&
|
||||
0 <= dev_info.m_rocmProp[ rocm_device_id ].minor );
|
||||
if ( ok_init && ok_dev ) {
|
||||
|
||||
const struct rocmDeviceProp & rocmProp =
|
||||
dev_info.m_rocmProp[ rocm_device_id ];
|
||||
|
||||
m_rocmDev = rocm_device_id ;
|
||||
|
||||
// rocmSetDevice( m_rocmDev ) );
|
||||
Kokkos::Impl::rocm_device_synchronize();
|
||||
|
||||
/*
|
||||
// Query what compute capability architecture a kernel executes:
|
||||
m_rocmArch = rocm_kernel_arch();
|
||||
if ( m_rocmArch != rocmProp.major * 100 + rocmProp.minor * 10 ) {
|
||||
std::cerr << "Kokkos::Experimental::ROCm::initialize WARNING: running kernels compiled for compute capability "
|
||||
<< ( m_rocmArch / 100 ) << "." << ( ( m_rocmArch % 100 ) / 10 )
|
||||
<< " on device with compute capability "
|
||||
<< rocmProp.major << "." << rocmProp.minor
|
||||
<< " , this will likely reduce potential performance."
|
||||
<< std::endl ;
|
||||
}
|
||||
*/
|
||||
// number of multiprocessors
|
||||
|
||||
m_multiProcCount = rocmProp.multiProcessorCount ;
|
||||
|
||||
//----------------------------------
|
||||
// Maximum number of wavefronts,
|
||||
// at most one workgroup per thread in a workgroup for reduction.
|
||||
|
||||
|
||||
m_maxSharedWords = rocmProp.sharedMemPerWavefront/ WordSize ;
|
||||
|
||||
//----------------------------------
|
||||
// Maximum number of Workgroups:
|
||||
|
||||
m_maxWorkgroup = 5*rocmProp.multiProcessorCount; //TODO: confirm usage and value
|
||||
|
||||
//----------------------------------
|
||||
// Multiblock reduction uses scratch flags for counters
|
||||
// and scratch space for partial reduction values.
|
||||
// Allocate some initial space. This will grow as needed.
|
||||
|
||||
{
|
||||
const unsigned reduce_block_count = m_maxWorkgroup * Impl::ROCmTraits::WorkgroupSize ;
|
||||
|
||||
(void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) );
|
||||
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
|
||||
}
|
||||
//----------------------------------
|
||||
|
||||
}
|
||||
else {
|
||||
|
||||
std::ostringstream msg ;
|
||||
msg << "Kokkos::Experimental::ROCm::initialize(" << rocm_device_id << ") FAILED" ;
|
||||
|
||||
if ( ! ok_init ) {
|
||||
msg << " : Already initialized" ;
|
||||
}
|
||||
if ( ! ok_id ) {
|
||||
msg << " : Device identifier out of range "
|
||||
<< "[0.." << (dev_info.m_rocmDevCount-1) << "]" ;
|
||||
}
|
||||
else if ( ! ok_dev ) {
|
||||
msg << " : Device " ;
|
||||
msg << dev_info.m_rocmProp[ rocm_device_id ].major ;
|
||||
msg << "." ;
|
||||
msg << dev_info.m_rocmProp[ rocm_device_id ].minor ;
|
||||
msg << " Need at least a GPU" ;
|
||||
msg << std::endl;
|
||||
}
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
|
||||
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Kokkos::Impl::init_lock_arrays_rocm_space();
|
||||
|
||||
// Kokkos::Impl::ROCmLockArraysStruct locks;
|
||||
// locks.atomic = atomic_lock_array_rocm_space_ptr(false);
|
||||
// locks.scratch = scratch_lock_array_rocm_space_ptr(false);
|
||||
// locks.threadid = threadid_lock_array_rocm_space_ptr(false);
|
||||
// rocmMemcpyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
typedef Kokkos::Experimental::ROCm::size_type ScratchGrain[ Impl::ROCmTraits::WorkgroupSize ] ;
|
||||
enum { sizeScratchGrain = sizeof(ScratchGrain) };
|
||||
|
||||
void rocmMemset( Kokkos::Experimental::ROCm::size_type * ptr , Kokkos::Experimental::ROCm::size_type value , Kokkos::Experimental::ROCm::size_type size)
|
||||
{
|
||||
char * mptr = (char * ) ptr;
|
||||
#if 0
|
||||
parallel_for_each(hc::extent<1>(size),
|
||||
[=, &ptr]
|
||||
(hc::index<1> idx) __HC__
|
||||
{
|
||||
int i = idx[0];
|
||||
ptr[i] = value;
|
||||
}).wait();
|
||||
#else
|
||||
for (int i= 0; i<size ; i++)
|
||||
{
|
||||
mptr[i] = (char) value;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
Kokkos::Experimental::ROCm::size_type *
|
||||
ROCmInternal::scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
|
||||
{
|
||||
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
|
||||
|
||||
|
||||
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::HostSpace()
|
||||
, "InternalScratchFlags"
|
||||
, ( sizeScratchGrain * m_scratchFlagsCount ) );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
|
||||
|
||||
rocmMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain );
|
||||
}
|
||||
|
||||
return m_scratchFlags ;
|
||||
}
|
||||
|
||||
Kokkos::Experimental::ROCm::size_type *
|
||||
ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
|
||||
{
|
||||
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
|
||||
|
||||
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::HostSpace()
|
||||
, "InternalScratchSpace"
|
||||
, ( sizeScratchGrain * m_scratchSpaceCount ) );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
|
||||
}
|
||||
|
||||
return m_scratchSpace ;
|
||||
}
|
||||
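// Arithmetic sketch of the grain rounding used by scratch_flags() and
// scratch_space() above: requests are rounded up to whole ScratchGrain units
// of WorkgroupSize words. Assuming a 4-byte size_type, sizeScratchGrain is
// 64 * 4 = 256 bytes, so a 1000-byte request becomes
// (1000 + 255) / 256 = 4 grains, i.e. 1024 bytes. The helper name is
// hypothetical.
inline size_t example_round_up_to_grain( size_t bytes , size_t grain )
{
  return ( ( bytes + grain - 1 ) / grain ) * grain ;
}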
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void ROCmInternal::finalize()
|
||||
{
|
||||
was_finalized = 1;
|
||||
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
|
||||
|
||||
// atomic_lock_array_rocm_space_ptr(false);
|
||||
// scratch_lock_array_rocm_space_ptr(false);
|
||||
// threadid_lock_array_rocm_space_ptr(false);
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< HostSpace > RecordROCm ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace > RecordHost ;
|
||||
|
||||
RecordROCm::decrement( RecordROCm::get_record( m_scratchFlags ) );
|
||||
RecordROCm::decrement( RecordROCm::get_record( m_scratchSpace ) );
|
||||
|
||||
m_rocmDev = -1 ;
|
||||
m_multiProcCount = 0 ;
|
||||
m_maxWorkgroup = 0 ;
|
||||
m_maxSharedWords = 0 ;
|
||||
m_scratchSpaceCount = 0 ;
|
||||
m_scratchFlagsCount = 0 ;
|
||||
m_scratchSpace = 0 ;
|
||||
m_scratchFlags = 0 ;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
Kokkos::Experimental::ROCm::size_type rocm_internal_cu_count()
|
||||
{ return ROCmInternal::singleton().m_multiProcCount ; }
|
||||
|
||||
Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_extent_size()
|
||||
{ return ROCmInternal::singleton().m_maxWorkgroup ; }
|
||||
|
||||
Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_shared_words()
|
||||
{ return ROCmInternal::singleton().m_maxSharedWords ; }
|
||||
|
||||
Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_space( const Kokkos::Experimental::ROCm::size_type size )
|
||||
{ return ROCmInternal::singleton().scratch_space( size ); }
|
||||
|
||||
Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
|
||||
{ return ROCmInternal::singleton().scratch_flags( size ); }
|
||||
|
||||
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
//ROCm::size_type ROCm::detect_device_count()
|
||||
//{ return Impl::ROCmInternalDevices::singleton().m_rocmDevCount ; }
|
||||
|
||||
int ROCm::concurrency() {
#if defined(KOKKOS_ARCH_KAVERI)
  return 8*64*40;  // 20480 kaveri
#else
  return 32*64*40; // 81920 fiji and hawaii
#endif
}
|
||||
int ROCm::is_initialized()
|
||||
{ return Kokkos::Impl::ROCmInternal::singleton().is_initialized(); }
|
||||
|
||||
void ROCm::initialize( const ROCm::SelectDevice config )
|
||||
{
|
||||
Kokkos::Impl::ROCmInternal::singleton().initialize( config.rocm_device_id );
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
|
||||
#if 0
|
||||
std::vector<unsigned>
|
||||
ROCm::detect_device_arch()
|
||||
{
|
||||
const Impl::ROCmInternalDevices & s = Impl::ROCmInternalDevices::singleton();
|
||||
|
||||
std::vector<unsigned> output( s.m_rocmDevCount );
|
||||
|
||||
for ( int i = 0 ; i < s.m_rocmDevCount ; ++i ) {
|
||||
output[i] = s.m_rocmProp[i].major * 100 + s.m_rocmProp[i].minor ;
|
||||
}
|
||||
|
||||
return output ;
|
||||
}
|
||||
|
||||
ROCm::size_type ROCm::device_arch()
|
||||
{
|
||||
return 1 ;
|
||||
}
|
||||
#endif
|
||||
|
||||
void ROCm::finalize()
|
||||
{
|
||||
Kokkos::Impl::ROCmInternal::singleton().finalize();
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
ROCm::ROCm()
|
||||
: m_device( Kokkos::Impl::ROCmInternal::singleton().m_rocmDev )
|
||||
{
|
||||
Kokkos::Impl::ROCmInternal::singleton().verify_is_initialized( "ROCm instance constructor" );
|
||||
}
|
||||
|
||||
bool ROCm::isAPU(int device) {
|
||||
const Kokkos::Impl::ROCmInternalDevices & dev_info =
|
||||
Kokkos::Impl::ROCmInternalDevices::singleton();
|
||||
return (dev_info.m_rocmProp[device].APU);
|
||||
}
|
||||
|
||||
bool ROCm::isAPU() {
|
||||
return ROCm::isAPU(rocm_device());
|
||||
}
|
||||
|
||||
//ROCm::ROCm( const int instance_id )
|
||||
// : m_device( Impl::ROCmInternal::singleton().m_rocmDev )
|
||||
//{}
|
||||
|
||||
void ROCm::print_configuration( std::ostream & s , const bool )
|
||||
{ Kokkos::Impl::ROCmInternal::singleton().print_configuration( s ); }
|
||||
|
||||
bool ROCm::sleep() { return false ; }
|
||||
|
||||
bool ROCm::wake() { return true ; }
|
||||
|
||||
void ROCm::fence()
|
||||
{
|
||||
Kokkos::Impl::rocm_device_synchronize();
|
||||
}
|
||||
|
||||
const char* ROCm::name() { return "ROCm"; }
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_ENABLE_ROCM
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
138  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp  (new file)
@ -0,0 +1,138 @@
/*
//@HEADER
// Kokkos v. 2.0  --  Copyright (2014) Sandia Corporation
// Standard Kokkos BSD 3-clause license header (identical to the header on the
// other new ROCm files in this commit).
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//@HEADER
*/
#include <type_traits>
#include <functional>   // std::reference_wrapper, used by rocm_unwrap below
#include <Kokkos_Macros.hpp>

#if !defined( KOKKOS_ROCM_INVOKE_H )
#define KOKKOS_ROCM_INVOKE_H

namespace Kokkos {
namespace Impl {

template<class Tag, class F, class... Ts, typename std::enable_if<(!std::is_void<Tag>()), int>::type = 0>
KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
{
  f(Tag(), static_cast<Ts&&>(xs)...);
}

template<class Tag, class F, class... Ts, typename std::enable_if<(std::is_void<Tag>()), int>::type = 0>
KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
{
  f(static_cast<Ts&&>(xs)...);
}

template<class F, class Tag=void>
struct rocm_invoke_fn
{
  F* f;
  rocm_invoke_fn(F& f_) : f(&f_)
  {}

  template<class... Ts>
  KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
  {
    rocm_invoke<Tag>(*f, static_cast<Ts&&>(xs)...);
  }
};

template<class Tag, class F>
KOKKOS_INLINE_FUNCTION rocm_invoke_fn<F, Tag> make_rocm_invoke_fn(F& f)
{
  return {f};
}

template<class T>
KOKKOS_INLINE_FUNCTION T& rocm_unwrap(T& x)
{
  return x;
}

template<class T>
KOKKOS_INLINE_FUNCTION T& rocm_unwrap(std::reference_wrapper<T> x)
{
  return x;
}

template<class F, class T>
struct rocm_capture_fn
{
  F f;
  T data;

  KOKKOS_INLINE_FUNCTION rocm_capture_fn(F f_, T x)
  : f(f_), data(x)
  {}

  template<class... Ts>
  KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
  {
    f(rocm_unwrap(data), static_cast<Ts&&>(xs)...);
  }
};

template<class F, class T>
KOKKOS_INLINE_FUNCTION rocm_capture_fn<F, T> rocm_capture(F f, T x)
{
  return {f, x};
}

template<class F, class T, class U, class... Ts>
KOKKOS_INLINE_FUNCTION auto rocm_capture(F f, T x, U y, Ts... xs) -> decltype(rocm_capture(rocm_capture(f, x), y, xs...))
{
  return rocm_capture(rocm_capture(f, x), y, xs...);
}

struct rocm_apply_op
{
  template<class F, class... Ts>
  KOKKOS_INLINE_FUNCTION void operator()(F&& f, Ts&&... xs) const
  {
    f(static_cast<Ts&&>(xs)...);
  }
};

}}

#endif
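The two rocm_invoke overloads above implement compile-time tag dispatch: a non-void Tag is default-constructed and prepended to the functor's arguments, and a void Tag is dropped. A self-contained sketch of the same idiom in plain C++ (the functor, tag, and invoke names below are illustrative and not part of this header):

// Illustration of the Tag-dispatch idiom used by rocm_invoke.
#include <iostream>
#include <type_traits>

struct SumTag {};

// A functor with both a tagged and an untagged call operator, as Kokkos functors may have.
struct MyFunctor {
  void operator()(SumTag, int i) const { std::cout << "tagged   " << i << "\n"; }
  void operator()(int i) const         { std::cout << "untagged " << i << "\n"; }
};

template<class Tag, class F, class... Ts,
         typename std::enable_if<!std::is_void<Tag>::value, int>::type = 0>
void invoke(F&& f, Ts&&... xs) { f(Tag(), static_cast<Ts&&>(xs)...); }   // tag prepended

template<class Tag, class F, class... Ts,
         typename std::enable_if<std::is_void<Tag>::value, int>::type = 0>
void invoke(F&& f, Ts&&... xs) { f(static_cast<Ts&&>(xs)...); }          // no tag

int main() {
  MyFunctor f;
  invoke<SumTag>(f, 0);  // selects the tagged overload
  invoke<void>(f, 1);    // selects the untagged overload
}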
lib/kokkos/core/src/ROCm/Kokkos_ROCm_Join.hpp (new file, 72 lines)
@ -0,0 +1,72 @@
/*
//@HEADER
// Kokkos v. 2.0  --  Copyright (2014) Sandia Corporation
// Standard Kokkos BSD 3-clause license header (identical to the header on the
// other new ROCm files in this commit).
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//@HEADER
*/
#if !defined( KOKKOS_ROCM_JOIN_H )
#define KOKKOS_ROCM_JOIN_H

namespace Kokkos {
namespace Impl {

// Adaptor to use ValueJoin with standard algorithms
template<class Joiner, class F>
struct join_operator
{
  const F* fp;
  template<class T, class U>
  T operator()(T x, const U& y) const
  {
    Joiner::join(*fp, &x, &y);
    return x;
  }
};

template<class Joiner, class F>
join_operator<Joiner, F> make_join_operator(const F& f)
{
  return join_operator<Joiner, F>{&f};
}

}}

#endif
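join_operator exists so a Kokkos-style "join(functor, dst, src)" reduction can drive a standard algorithm. A self-contained sketch of that idea: MaxFunctor and the local ValueJoin/join_operator stand-ins below are illustrative only; the real adaptor is instantiated with Kokkos::Impl::FunctorValueJoin.

// Illustration: driving std::accumulate with a join-style reduction.
#include <numeric>
#include <vector>
#include <iostream>

struct MaxFunctor {
  void join(double* dst, const double* src) const { if (*src > *dst) *dst = *src; }
};

// Stand-in for Kokkos::Impl::FunctorValueJoin<F, Tag>.
struct ValueJoin {
  static void join(const MaxFunctor& f, double* dst, const double* src) { f.join(dst, src); }
};

template<class Joiner, class F>
struct join_operator {
  const F* fp;
  double operator()(double x, const double& y) const { Joiner::join(*fp, &x, &y); return x; }
};

int main() {
  MaxFunctor f;
  std::vector<double> v{1.5, 9.0, -2.0, 4.0};
  double r = std::accumulate(v.begin(), v.end(), v.front(),
                             join_operator<ValueJoin, MaxFunctor>{&f});
  std::cout << r << "\n";  // prints 9
}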
lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp (new file, 1265 lines)
File diff suppressed because it is too large.

lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp (new file, 193 lines)
@ -0,0 +1,193 @@
/*
//@HEADER
// Kokkos v. 2.0  --  Copyright (2014) Sandia Corporation
// Standard Kokkos BSD 3-clause license header (identical to the header on the
// other new ROCm files in this commit).
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//@HEADER
*/
///////////////////////////////////////////////////////////////////////////////
// AMP REDUCE
//////////////////////////////////////////////////////////////////////////////

#if !defined( KOKKOS_ROCM_AMP_REDUCE_INL )
#define KOKKOS_ROCM_AMP_REDUCE_INL

#include <iostream>

#include <algorithm>
#include <numeric>
#include <cmath>
#include <type_traits>
#include <vector>    // std::vector for the per-tile host results below
#include <cassert>   // assert on output_result below
#include <ROCm/Kokkos_ROCm_Tile.hpp>
#include <ROCm/Kokkos_ROCm_Invoke.hpp>
#include <ROCm/Kokkos_ROCm_Join.hpp>

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

namespace Kokkos {
namespace Impl {

template<class T>
T* reduce_value(T* x, std::true_type) [[hc]]
{
  return x;
}

template<class T>
T& reduce_value(T* x, std::false_type) [[hc]]
{
  return *x;
}

#if KOKKOS_ROCM_HAS_WORKAROUNDS
struct always_true
{
  template<class... Ts>
  bool operator()(Ts&&...) const
  {
    return true;
  }
};
#endif

template< class Tag, class F, class ReducerType, class Invoker, class T >
void reduce_enqueue(
  const int szElements, // size of the extent
  const F & f,
  const ReducerType& reducer,
  Invoker invoke,
  T * const output_result,
  int const output_length,
  const int team_size=64,
  const int vector_size=1,
  int const shared_size=0)
{
  using namespace hc ;

  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, F, ReducerType> ReducerConditional;
  typedef typename ReducerConditional::type ReducerTypeFwd;

  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , Tag > ValueTraits ;
  typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , Tag >   ValueInit ;
  typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , Tag >   ValueJoin ;
  typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , Tag >       ValueFinal ;

  typedef typename ValueTraits::pointer_type   pointer_type ;
  typedef typename ValueTraits::reference_type reference_type ;

  if (output_length < 1) return;

  assert(output_result != nullptr);
  const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);

  // allocate host and device memory for the results from each team
  std::vector<T> result_cpu(td.num_tiles*output_length);
  hc::array<T> result(td.num_tiles*output_length);

  auto fut = tile_for<T[]>(td, [=,&result](hc::tiled_index<1> t_idx, tile_buffer<T[]> buffer) [[hc]]
  {
    const auto local  = t_idx.local[0];
    const auto global = t_idx.global[0];
    const auto tile   = t_idx.tile[0];

    buffer.action_at(local, [&](T* state)
    {
      ValueInit::init(ReducerConditional::select(f, reducer), state);
      invoke(make_rocm_invoke_fn<Tag>(f), t_idx, td, reduce_value(state, std::is_pointer<reference_type>()));
    });
    t_idx.barrier.wait();

    // Reduce within a tile using multiple threads.
    // Even though buffer.size() is always 64, the value 64 must be hard coded below
    // due to a compiler bug.
    // for(std::size_t s = 1; s < buffer.size(); s *= 2)
    for(std::size_t s = 1; s < 64; s *= 2)
    {
      const std::size_t index = 2 * s * local;
      // if (index < buffer.size())
      if (index < 64)
      {
        buffer.action_at(index, index + s, [&](T* x, T* y)
        {
          ValueJoin::join(ReducerConditional::select(f, reducer), x, y);
        });
      }
      t_idx.barrier.wait();
    }

    // Store the tile result in the global memory.
    if (local == 0)
    {
#if KOKKOS_ROCM_HAS_WORKAROUNDS
      // Workaround for assigning from LDS memory: std::copy should work
      // directly
      buffer.action_at(0, [&](T* x)
      {
#if ROCM15
        // new ROCM 15 address space changes aren't implemented in std algorithms yet
        auto * src  = reinterpret_cast<char *>(x);
        auto * dest = reinterpret_cast<char *>(result.data()+tile*output_length);
        for(int i=0; i<sizeof(T);i++) dest[i] = src[i];
#else
        // Workaround: copy_if used to avoid memmove
        std::copy_if(x, x+output_length, result.data()+tile*output_length, always_true{} );
#endif
      });
#else
      std::copy(buffer, buffer+output_length, result.data()+tile*output_length);
#endif
    }

  });
  ValueInit::init(ReducerConditional::select(f, reducer), output_result);
  fut.wait();

  copy(result,result_cpu.data());
  for(std::size_t i=0;i<td.num_tiles;i++)
    ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);

  ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );

}

}} //end of namespace Kokkos::Impl

#endif /* #if !defined( KOKKOS_ROCM_AMP_REDUCE_INL ) */
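reduce_enqueue reduces each tile with a binary tree: at step s, lane `local` folds the element at 2*s*local + s into the element at 2*s*local, and lane 0 ends up holding the tile's partial result. A host-side replay of that index pattern, with integer addition standing in for ValueJoin::join (illustrative only):

// Illustration: the in-tile tree reduction, replayed serially on the host.
#include <array>
#include <cstddef>
#include <iostream>

int main() {
  constexpr std::size_t tile = 64;                         // matches the hard-coded tile width above
  std::array<int, tile> buffer;
  for (std::size_t i = 0; i < tile; ++i) buffer[i] = 1;    // each "thread" contributes 1

  for (std::size_t s = 1; s < tile; s *= 2) {
    for (std::size_t local = 0; local < tile; ++local) {   // all lanes act in lock-step on the GPU
      const std::size_t index = 2 * s * local;
      if (index + s < tile) buffer[index] += buffer[index + s];   // the "join"
    }
  }
  std::cout << buffer[0] << "\n";   // 64: the tile's partial result, written out by lane 0
}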
lib/kokkos/core/src/ROCm/Kokkos_ROCm_ReduceScan.hpp (new file, 605 lines)
@ -0,0 +1,605 @@
/*
//@HEADER
// Kokkos v. 2.0  --  Copyright (2014) Sandia Corporation
// Standard Kokkos BSD 3-clause license header (identical to the header on the
// other new ROCm files in this commit).
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//@HEADER
*/
#ifndef KOKKOS_ROCM_REDUCESCAN_HPP
#define KOKKOS_ROCM_REDUCESCAN_HPP

#include <Kokkos_Macros.hpp>

/* only compile this file if ROCM is enabled for Kokkos */
#if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )

//#include <utility>

#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <ROCm/Kokkos_ROCm_Vectorization.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

//----------------------------------------------------------------------------

template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl( T & out , T const & in , int lane ,
  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
  *reinterpret_cast<int*>(&out) =
    __shfl( *reinterpret_cast<int const *>(&in) , lane , width );
}

template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl( T & out , T const & in , int lane ,
  typename std::enable_if
    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
    , int >::type width )
{
  enum : int { N = sizeof(T) / sizeof(int) };

  for ( int i = 0 ; i < N ; ++i ) {
    reinterpret_cast<int*>(&out)[i] =
      __shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
  }
}

//----------------------------------------------------------------------------

template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_down( T & out , T const & in , int delta ,
  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
  *reinterpret_cast<int*>(&out) =
    __shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
}

template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_down( T & out , T const & in , int delta ,
  typename std::enable_if
    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
    , int >::type width )
{
  enum : int { N = sizeof(T) / sizeof(int) };

  for ( int i = 0 ; i < N ; ++i ) {
    reinterpret_cast<int*>(&out)[i] =
      __shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
  }
}

//----------------------------------------------------------------------------

template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_up( T & out , T const & in , int delta ,
  typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
  *reinterpret_cast<int*>(&out) =
    __shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
}

template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_up( T & out , T const & in , int delta ,
  typename std::enable_if
    < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
    , int >::type width )
{
  enum : int { N = sizeof(T) / sizeof(int) };

  for ( int i = 0 ; i < N ; ++i ) {
    reinterpret_cast<int*>(&out)[i] =
      __shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
  }
}
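The chunked rocm_shfl* overloads move types wider than int as sizeof(T)/sizeof(int) separate 32-bit pieces, because the underlying shuffle intrinsic exchanges one int lane at a time. A host-side sketch of why reassembling the pieces is lossless (memcpy stands in for the per-chunk shuffle; illustrative only):

// Illustration: an 8-byte double round-trips through two 4-byte chunks intact.
#include <cstring>
#include <iostream>

int main() {
  const double in = 3.141592653589793;
  int chunk[2];                          // N = sizeof(double) / sizeof(int) == 2
  std::memcpy(chunk, &in, sizeof in);    // "send" each chunk, as a 32-bit shuffle would

  double out = 0.0;
  std::memcpy(&out, chunk, sizeof out);  // the receiving lane reassembles the chunks
  std::cout << (out == in) << "\n";      // 1: the value survives bit-for-bit
}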
#if 0
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Reduce within a workgroup over team.vector_length(), the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Requires:
|
||||
* team.vector_length() is power of 2
|
||||
* team.vector_length() <= 32 (one workgroup)
|
||||
*
|
||||
* Cannot use "butterfly" pattern because floating point
|
||||
* addition is non-associative. Therefore, must broadcast
|
||||
* the final result.
|
||||
*/
|
||||
template< class Reducer >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void rocm_intra_workgroup_vector_reduce( Reducer const & reducer )
|
||||
{
|
||||
static_assert(
|
||||
std::is_reference< typename Reducer::reference_type >::value , "" );
|
||||
|
||||
if ( 1 < team.vector_length() ) {
|
||||
|
||||
typename Reducer::value_type tmp ;
|
||||
|
||||
for ( int i = team.vector_length() ; ( i >>= 1 ) ; ) {
|
||||
|
||||
rocm_shfl_down( tmp , reducer.reference() , i , team.vector_length() );
|
||||
|
||||
if ( team.vector_rank() < i ) { reducer.join( reducer.data() , & tmp ); }
|
||||
}
|
||||
|
||||
// Broadcast from root "lane" to all other "lanes"
|
||||
|
||||
rocm_shfl( reducer.reference() , reducer.reference() , 0 , team.vector_length() );
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Inclusive scan over team.vector_length(), the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Algorithm is concurrent bottom-up reductions in triangular pattern
|
||||
* where each ROCM thread is the root of a reduction tree from the
|
||||
* zeroth ROCM thread to itself.
|
||||
*
|
||||
* Requires:
|
||||
* team.vector_length() is power of 2
|
||||
* team.vector_length() <= 32 (one workgroup)
|
||||
*/
|
||||
template< typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void rocm_intra_workgroup_vector_inclusive_scan( ValueType & local )
|
||||
{
|
||||
ValueType tmp ;
|
||||
|
||||
// Bottom up:
|
||||
// [t] += [t-1] if t >= 1
|
||||
// [t] += [t-2] if t >= 2
|
||||
// [t] += [t-4] if t >= 4
|
||||
// ...
|
||||
|
||||
for ( int i = 1 ; i < team.vector_length() ; i <<= 1 ) {
|
||||
|
||||
rocm_shfl_up( tmp , local , i , team.vector_length() );
|
||||
|
||||
if ( i <= team.vector_rank() ) { local += tmp ; }
|
||||
}
|
||||
}
|
||||
#endif
|
||||
//----------------------------------------------------------------------------
/*
 *  Algorithmic constraints:
 *   (a) threads with same team.team_rank() have same value
 *   (b) team.vector_length() == power of two
 *   (c) blockDim.z == 1
 */

template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_intra_workgroup_reduction( const ROCmTeamMember& team,
                                     ValueType& result,
                                     const JoinOp& join) {

  unsigned int shift = 1;
  int max_active_thread = team.team_size();

  // Reduce over values from threads with different team.team_rank()
  while(team.vector_length() * shift < 32 ) {
    const ValueType tmp = shfl_down(result, team.vector_length()*shift,32u);
    // Only join if the upper thread is active (this allows a non-power-of-two team.team_size())
    if(team.team_rank() + shift < max_active_thread)
      join(result , tmp);
    shift*=2;
  }

  result = shfl(result,0,32);
}

template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_inter_workgroup_reduction( const ROCmTeamMember& team,
                                     ValueType& value,
                                     const JoinOp& join) {

#define STEP_WIDTH 4

  tile_static ValueType sh_result[256];
  int max_active_thread = team.team_size();
  ValueType* result = (ValueType*) & sh_result;
  const unsigned step = 256 / team.vector_length();
  unsigned shift = STEP_WIDTH;
  const int id = team.team_rank()%step==0?team.team_rank()/step:65000;
  if(id < STEP_WIDTH ) {
    result[id] = value;
  }
  team.team_barrier();

  while (shift<=max_active_thread/step) {
    if(shift<=id && shift+STEP_WIDTH>id && team.vector_rank()==0) {
      join(result[id%STEP_WIDTH],value);
    }
    team.team_barrier();
    shift+=STEP_WIDTH;
  }

  value = result[0];
  for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
    join(value,result[i]);
}
#if 0
|
||||
template< class ValueType , class JoinOp>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void rocm_intra_block_reduction( ROCmTeamMember& team,
|
||||
ValueType& value,
|
||||
const JoinOp& join,
|
||||
const int max_active_thread) {
|
||||
rocm_intra_workgroup_reduction(team,value,join,max_active_thread);
|
||||
rocm_inter_workgroup_reduction(team,value,join,max_active_thread);
|
||||
}
|
||||
|
||||
template< class FunctorType , class JoinOp , class ArgTag = void >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool rocm_inter_block_reduction( ROCmTeamMember& team,
|
||||
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
|
||||
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
|
||||
const JoinOp& join,
|
||||
ROCm::size_type * const m_scratch_space,
|
||||
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
|
||||
ROCm::size_type * const m_scratch_flags,
|
||||
const int max_active_thread) {
|
||||
#ifdef __ROCM_ARCH__
|
||||
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
|
||||
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
|
||||
|
||||
//Do the intra-block reduction with shfl operations and static shared memory
|
||||
rocm_intra_block_reduction(value,join,max_active_thread);
|
||||
|
||||
const unsigned id = team.team_rank()*team.vector_length() + team.vector_rank();
|
||||
|
||||
//One thread in the block writes block result to global scratch_memory
|
||||
if(id == 0 ) {
|
||||
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
|
||||
*global = value;
|
||||
}
|
||||
|
||||
//One workgroup of last block performs inter block reduction through loading the block values from global scratch_memory
|
||||
bool last_block = false;
|
||||
|
||||
team.team_barrier();
|
||||
if ( id < 32 ) {
|
||||
ROCm::size_type count;
|
||||
|
||||
//Figure out whether this is the last block
|
||||
if(id == 0)
|
||||
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
|
||||
count = Kokkos::shfl(count,0,32);
|
||||
|
||||
//Last block does the inter block reduction
|
||||
if( count == gridDim.x - 1) {
|
||||
//set flag back to zero
|
||||
if(id == 0)
|
||||
*m_scratch_flags = 0;
|
||||
last_block = true;
|
||||
value = neutral;
|
||||
|
||||
pointer_type const volatile global = (pointer_type) m_scratch_space ;
|
||||
|
||||
//Reduce all global values with splitting work over threads in one workgroup
|
||||
const int step_size = team.vector_length()*team.team_size() < 32 ? team.vector_length()*team.team_size() : 32;
|
||||
for(int i=id; i<gridDim.x; i+=step_size) {
|
||||
value_type tmp = global[i];
|
||||
join(value, tmp);
|
||||
}
|
||||
|
||||
//Perform shfl reductions within the workgroup only join if contribution is valid (allows gridDim.x non power of two and <32)
|
||||
if (team.vector_length()*team.team_size() > 1) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 1,32);
|
||||
if( id + 1 < gridDim.x )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (team.vector_length()*team.team_size() > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < gridDim.x )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (team.vector_length()*team.team_size() > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < gridDim.x )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (team.vector_length()*team.team_size() > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < gridDim.x )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (team.vector_length()*team.team_size() > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < gridDim.x )
|
||||
join(value, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//The last block has in its thread=0 the global reduction value through "value"
|
||||
return last_block;
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
#if 0
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// See section B.17 of ROCm C Programming Guide Version 3.2
|
||||
// for discussion of
|
||||
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
|
||||
// function qualifier which could be used to improve performance.
|
||||
//----------------------------------------------------------------------------
|
||||
// Maximize shared memory and minimize L1 cache:
|
||||
// rocmFuncSetCacheConfig(MyKernel, rocmFuncCachePreferShared );
|
||||
// For 2.0 capability: 48 KB shared and 16 KB L1
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
/*
|
||||
* Algorithmic constraints:
|
||||
* (a) team.team_size() is a power of two
|
||||
* (b) team.team_size() <= 512
|
||||
* (c) team.vector_length() == blockDim.z == 1
|
||||
*/
|
||||
|
||||
template< bool DoScan , class FunctorType , class ArgTag >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void rocm_intra_block_reduce_scan( const FunctorType & functor ,
|
||||
const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
|
||||
{
|
||||
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
|
||||
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
|
||||
const unsigned value_count = ValueTraits::value_count( functor );
|
||||
const unsigned BlockSizeMask = team.team_size() - 1 ;
|
||||
|
||||
// Must have power of two thread count
|
||||
|
||||
if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_intra_block_scan requires power-of-two blockDim"); }
|
||||
|
||||
#define BLOCK_REDUCE_STEP( R , TD , S ) \
|
||||
if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
|
||||
|
||||
#define BLOCK_SCAN_STEP( TD , N , S ) \
|
||||
if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
|
||||
|
||||
const unsigned rtid_intra = team.team_rank() ^ BlockSizeMask ;
|
||||
const pointer_type tdata_intra = base_data + value_count * team.team_rank() ;
|
||||
|
||||
{ // Intra-workgroup reduction:
|
||||
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
|
||||
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
|
||||
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
|
||||
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
|
||||
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
|
||||
}
|
||||
|
||||
team.team_barrier(); // Wait for all workgroups to reduce
|
||||
|
||||
{ // Inter-workgroup reduce-scan by a single workgroup to avoid extra synchronizations
|
||||
const unsigned rtid_inter = ( team.team_rank() ^ BlockSizeMask ) << ROCmTraits::WarpIndexShift ;
|
||||
|
||||
if ( rtid_inter < team.team_size() ) {
|
||||
|
||||
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
|
||||
|
||||
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
|
||||
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
|
||||
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
|
||||
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
|
||||
|
||||
if ( DoScan ) {
|
||||
|
||||
int n = ( rtid_inter & 32 ) ? 32 : (
|
||||
( rtid_inter & 64 ) ? 64 : (
|
||||
( rtid_inter & 128 ) ? 128 : (
|
||||
( rtid_inter & 256 ) ? 256 : 0 )));
|
||||
|
||||
if ( ! ( rtid_inter + n < team.team_size() ) ) n = 0 ;
|
||||
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
|
||||
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
team.team_barrier(); // Wait for inter-workgroup reduce-scan to complete
|
||||
|
||||
if ( DoScan ) {
|
||||
int n = ( rtid_intra & 1 ) ? 1 : (
|
||||
( rtid_intra & 2 ) ? 2 : (
|
||||
( rtid_intra & 4 ) ? 4 : (
|
||||
( rtid_intra & 8 ) ? 8 : (
|
||||
( rtid_intra & 16 ) ? 16 : 0 ))));
|
||||
|
||||
if ( ! ( rtid_intra + n < team.team_size() ) ) n = 0 ;
|
||||
#ifdef KOKKOS_IMPL_ROCM_CLANG_WORKAROUND
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,4) team.team_barrier();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,3) team.team_barrier();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,2) team.team_barrier();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,1) team.team_barrier();//__threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,0) team.team_barrier();
|
||||
#else
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
|
||||
BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
|
||||
#endif
|
||||
}
|
||||
|
||||
#undef BLOCK_SCAN_STEP
|
||||
#undef BLOCK_REDUCE_STEP
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/**\brief Input value-per-thread starting at 'shared_data'.
|
||||
* Reduction value at last thread's location.
|
||||
*
|
||||
* If 'DoScan' then write blocks' scan values and block-groups' scan values.
|
||||
*
|
||||
* Global reduce result is in the last threads' 'shared_data' location.
|
||||
*/
|
||||
template< bool DoScan , class FunctorType , class ArgTag >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool rocm_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
const ROCm::size_type block_id ,
|
||||
const ROCm::size_type block_count ,
|
||||
ROCm::size_type * const shared_data ,
|
||||
ROCm::size_type * const global_data ,
|
||||
ROCm::size_type * const global_flags )
|
||||
{
|
||||
typedef ROCm::size_type size_type ;
|
||||
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
|
||||
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
|
||||
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
|
||||
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
// '__ffs' = position of the least significant bit set to 1.
|
||||
// 'team.team_size()' is guaranteed to be a power of two so this
|
||||
// is the integral shift value that can replace an integral divide.
|
||||
const unsigned BlockSizeShift = __ffs( team.team_size() ) - 1 ;
|
||||
const unsigned BlockSizeMask = team.team_size() - 1 ;
|
||||
|
||||
// Must have power of two thread count
|
||||
if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_single_inter_block_reduce_scan requires power-of-two blockDim"); }
|
||||
|
||||
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
|
||||
word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
|
||||
|
||||
// Reduce the accumulation for the entire block.
|
||||
rocm_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
|
||||
|
||||
{
|
||||
// Write accumulation total to global scratch space.
|
||||
// Accumulation total is the last thread's data.
|
||||
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
|
||||
size_type * const global = global_data + word_count.value * block_id ;
|
||||
|
||||
#if (__ROCM_ARCH__ < 500)
|
||||
for ( size_type i = team.team_rank() ; i < word_count.value ; i += team.team_size() ) { global[i] = shared[i] ; }
|
||||
#else
|
||||
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
|
||||
// If this block is not the last block to contribute to this group then the block is done.
|
||||
team.team_barrier();
|
||||
const bool is_last_block =
|
||||
! team.team_reduce( team.team_rank() ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ,Impl::JoinAdd<ValueType>());
|
||||
|
||||
if ( is_last_block ) {
|
||||
|
||||
const size_type b = ( long(block_count) * long(team.team_rank()) ) >> BlockSizeShift ;
|
||||
const size_type e = ( long(block_count) * long( team.team_rank() + 1 ) ) >> BlockSizeShift ;
|
||||
|
||||
{
|
||||
void * const shared_ptr = shared_data + word_count.value * team.team_rank() ;
|
||||
reference_type shared_value = ValueInit::init( functor , shared_ptr );
|
||||
|
||||
for ( size_type i = b ; i < e ; ++i ) {
|
||||
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
|
||||
}
|
||||
}
|
||||
|
||||
rocm_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
|
||||
|
||||
if ( DoScan ) {
|
||||
|
||||
size_type * const shared_value = shared_data + word_count.value * ( team.team_rank() ? team.team_rank() - 1 : team.team_size() );
|
||||
|
||||
if ( ! team.team_rank() ) { ValueInit::init( functor , shared_value ); }
|
||||
|
||||
// Join previous inclusive scan value to each member
|
||||
for ( size_type i = b ; i < e ; ++i ) {
|
||||
size_type * const global_value = global_data + word_count.value * i ;
|
||||
ValueJoin::join( functor , shared_value , global_value );
|
||||
ValueOps ::copy( functor , global_value , shared_value );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return is_last_block ;
|
||||
}
|
||||
|
||||
// Size in bytes required for inter block reduce or scan
|
||||
template< bool DoScan , class FunctorType , class ArgTag >
|
||||
inline
|
||||
unsigned rocm_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
|
||||
{
|
||||
return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
|
||||
}
|
||||
#endif
} // namespace Impl
} // namespace Kokkos

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#endif /* #if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM ) */
#endif /* KOKKOS_ROCM_REDUCESCAN_HPP */
lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp (new file, 157 lines)
@ -0,0 +1,157 @@
/*
//@HEADER
// Kokkos v. 2.0  --  Copyright (2014) Sandia Corporation
// Standard Kokkos BSD 3-clause license header (identical to the header on the
// other new ROCm files in this commit).
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//@HEADER
*/
#include <vector>    // std::vector for the per-tile host results below
#include <ROCm/Kokkos_ROCm_Invoke.hpp>
#include <ROCm/Kokkos_ROCm_Join.hpp>

namespace Kokkos {
namespace Impl {

template< class Tag, class F, class TransformIndex>
void scan_enqueue(
  const int len,
  const F & f,
  TransformIndex transform_index)
{
  typedef Kokkos::Impl::FunctorValueTraits< F, Tag> ValueTraits;
  typedef Kokkos::Impl::FunctorValueInit< F, Tag>   ValueInit;
  typedef Kokkos::Impl::FunctorValueJoin< F, Tag>   ValueJoin;
  typedef Kokkos::Impl::FunctorValueOps< F, Tag>    ValueOps;

  typedef typename ValueTraits::value_type     value_type;
  typedef typename ValueTraits::pointer_type   pointer_type;
  typedef typename ValueTraits::reference_type reference_type;

  const auto td = get_tile_desc<value_type>(len);
  std::vector<value_type> result_cpu(td.num_tiles);
  hc::array<value_type> result(td.num_tiles);
  hc::array<value_type> scratch(len);

  tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
  {
    const auto local  = t_idx.local[0];
    const auto global = t_idx.global[0];
    const auto tile   = t_idx.tile[0];

    // Join tile buffer elements
    const auto join = [&](std::size_t i, std::size_t j)
    {
      buffer.action_at(i, j, [&](value_type& x, const value_type& y)
      {
        ValueJoin::join(f, &x, &y);
      });
    };

    // Copy into tile
    buffer.action_at(local, [&](value_type& state)
    {
      ValueInit::init(f, &state);
      if (global < len) rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), state, false);
    });
    t_idx.barrier.wait();
    // Up sweep phase
    for(std::size_t d=1;d<buffer.size();d*=2)
    {
      auto d2 = 2*d;
      auto i = local*d2;
      if(i<len)
      {
        auto j = i + d - 1;
        auto k = i + d2 - 1;
        // join(k, j); // no longer needed with ROCm 1.6
        ValueJoin::join(f, &buffer[k], &buffer[j]);
      }
    }
    t_idx.barrier.wait();

    result[tile] = buffer[buffer.size()-1];
    buffer[buffer.size()-1] = 0;
    // Down sweep phase
    for(std::size_t d=buffer.size()/2;d>0;d/=2)
    {
      auto d2 = 2*d;
      auto i = local*d2;
      if(i<len)
      {
        auto j = i + d - 1;
        auto k = i + d2 - 1;
        auto t = buffer[k];
        // join(k, j); // no longer needed with ROCm 1.6
        ValueJoin::join(f, &buffer[k], &buffer[j]);
        buffer[j] = t;
      }
      t_idx.barrier.wait();
    }
    // Copy tiles into global memory
    if (global < len) scratch[global] = buffer[local];
  }).wait();
  copy(result,result_cpu.data());

  // std::partial_sum was segfaulting, even though this is CPU code:
  // if(td.num_tiles>1)
  //   std::partial_sum(result_cpu.data(), result_cpu.data()+(td.num_tiles-1)*sizeof(value_type), result_cpu.data(), make_join_operator<ValueJoin>(f));
  // so use this implementation instead.
  for(int i=1; i<td.num_tiles; i++)
    ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);

  copy(result_cpu.data(),result);
  hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]]
  {
    // const auto local = t_idx.local[0];
    const auto global = t_idx.global[0];
    const auto tile   = t_idx.tile[0];

    if (global < len)
    {
      auto final_state = scratch[global];

      // the join is locking up, at least with 1.6
      if (tile != 0) final_state += result[tile-1];
      // if (tile != 0) ValueJoin::join(f, &final_state, &result[tile-1]);
      rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), final_state, true);
    }
  }).wait();
}

} // namespace Impl
} // namespace Kokkos
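scan_enqueue performs a Blelloch exclusive scan inside each tile: the up sweep builds subtree totals at the right child of each subtree, the tile total is saved off, and the down sweep distributes prefixes back down the tree. A serial host replay of the same two phases, with integer addition standing in for ValueJoin::join (illustrative only):

// Illustration: Blelloch exclusive scan on one small "tile".
#include <array>
#include <cstddef>
#include <iostream>

int main() {
  constexpr std::size_t n = 8;                 // tile size (power of two)
  std::array<int, n> buf{3, 1, 7, 0, 4, 1, 6, 3};

  // Up sweep: build partial sums at the right child of each subtree.
  for (std::size_t d = 1; d < n; d *= 2)
    for (std::size_t i = 0; i + 2 * d <= n; i += 2 * d)
      buf[i + 2 * d - 1] += buf[i + d - 1];

  int total = buf[n - 1];                      // tile total, stored to result[tile] above
  buf[n - 1] = 0;                              // identity seeds the down sweep

  // Down sweep: push prefixes back down the tree.
  for (std::size_t d = n / 2; d > 0; d /= 2)
    for (std::size_t i = 0; i + 2 * d <= n; i += 2 * d) {
      int t = buf[i + 2 * d - 1];
      buf[i + 2 * d - 1] += buf[i + d - 1];
      buf[i + d - 1] = t;
    }

  for (int v : buf) std::cout << v << ' ';     // 0 3 4 11 11 15 16 22
  std::cout << "| total " << total << "\n";    // 25
}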
lib/kokkos/core/src/ROCm/Kokkos_ROCm_Space.cpp (new file, 726 lines)
@ -0,0 +1,726 @@
/*
//@HEADER
// Kokkos v. 2.0  --  Copyright (2014) Sandia Corporation
// Standard Kokkos BSD 3-clause license header (identical to the header on the
// other new ROCm files in this commit).
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//@HEADER
*/
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if ROCM is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_ROCM
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_ROCm.hpp>
|
||||
#include <Kokkos_ROCmSpace.hpp>
|
||||
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
#define ROCM_SAFE_CALL
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
using namespace hc;
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
|
||||
DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
|
||||
DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
|
||||
{
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
av.copy( src , dst , n);
|
||||
}
|
||||
|
||||
|
||||
hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n) {
|
||||
hc::accelerator acc;
|
||||
hc::accelerator_view av = acc.get_default_view();
|
||||
return(av.copy_async( src , dst , n));
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
void Experimental::ROCmSpace::access_error()
|
||||
{
|
||||
const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
void Experimental::ROCmSpace::access_error( const void * const )
|
||||
{
|
||||
const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
ROCmSpace::ROCmSpace()
|
||||
: m_device( ROCm().rocm_device() )
|
||||
{
|
||||
}
|
||||
|
||||
ROCmHostPinnedSpace::ROCmHostPinnedSpace()
|
||||
{
|
||||
}
|
||||
|
||||
void * ROCmSpace::allocate( const size_t arg_alloc_size ) const
|
||||
{
|
||||
void * ptr = Kokkos::Impl::rocm_device_allocate( arg_alloc_size );
|
||||
return ptr ;
|
||||
}
|
||||
|
||||
void * Experimental::ROCmHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
|
||||
{
|
||||
void * ptr = Kokkos::Impl::rocm_hostpinned_allocate( arg_alloc_size );
|
||||
return ptr ;
|
||||
}
|
||||
|
||||
void ROCmSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
|
||||
{
|
||||
Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
|
||||
}
|
||||
|
||||
void Experimental::ROCmHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
|
||||
{
|
||||
Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
|
||||
}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Impl {

SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record ;

SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record ;

std::string
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_label() const
{
  SharedAllocationHeader header ;

  Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );

  return std::string( header.m_label );
}

std::string
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_label() const
{
  return std::string( RecordBase::head()->m_label );
}

SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
allocate( const Kokkos::Experimental::ROCmSpace & arg_space
        , const std::string & arg_label
        , const size_t arg_alloc_size
        )
{
  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}

SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
allocate( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
        , const std::string & arg_label
        , const size_t arg_alloc_size
        )
{
  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}

void
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
  delete static_cast<SharedAllocationRecord*>(arg_rec);
}

void
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
  delete static_cast<SharedAllocationRecord*>(arg_rec);
}
|
||||
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
|
||||
SharedAllocationHeader header ;
|
||||
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) );
|
||||
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmSpace::name()),header.m_label,
|
||||
data(),size());
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
|
||||
, SharedAllocationRecord< void , void >::m_alloc_size
|
||||
);
|
||||
}
|
||||
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
|
||||
data(),size());
|
||||
}
|
||||
#endif
|
||||
|
||||
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
|
||||
, SharedAllocationRecord< void , void >::m_alloc_size
|
||||
);
|
||||
}
|
||||
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
|
||||
SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
|
||||
)
|
||||
// Pass through allocated [ SharedAllocationHeader , user_memory ]
|
||||
// Pass through deallocation function
|
||||
: SharedAllocationRecord< void , void >
|
||||
( & SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record
|
||||
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
|
||||
, sizeof(SharedAllocationHeader) + arg_alloc_size
|
||||
, arg_dealloc
|
||||
)
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
SharedAllocationHeader header ;
|
||||
|
||||
// Fill in the Header information
|
||||
header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
|
||||
|
||||
strncpy( header.m_label
|
||||
, arg_label.c_str()
|
||||
, SharedAllocationHeader::maximum_label_length
|
||||
);
|
||||
|
||||
// Copy to device memory
|
||||
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
|
||||
}
|
||||
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
|
||||
SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
|
||||
)
|
||||
// Pass through allocated [ SharedAllocationHeader , user_memory ]
|
||||
// Pass through deallocation function
|
||||
: SharedAllocationRecord< void , void >
|
||||
( & SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record
|
||||
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
|
||||
, sizeof(SharedAllocationHeader) + arg_alloc_size
|
||||
, arg_dealloc
|
||||
)
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
#endif
|
||||
// Fill in the Header information, directly accessible via host pinned memory
|
||||
|
||||
RecordBase::m_alloc_ptr->m_record = this ;
|
||||
|
||||
strncpy( RecordBase::m_alloc_ptr->m_label
|
||||
, arg_label.c_str()
|
||||
, SharedAllocationHeader::maximum_label_length
|
||||
);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------

void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
                , const std::string & arg_alloc_label
                , const size_t arg_alloc_size )
{
  if ( ! arg_alloc_size ) return (void *) 0 ;

  SharedAllocationRecord * const r =
    allocate( arg_space , arg_alloc_label , arg_alloc_size );

  RecordBase::increment( r );

  return r->data();
}

void SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
  if ( arg_alloc_ptr != 0 ) {
    SharedAllocationRecord * const r = get_record( arg_alloc_ptr );

    RecordBase::decrement( r );
  }
}

void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
                  , const size_t arg_alloc_size )
{
  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );

  Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace>( r_new->data() , r_old->data()
                                                                                         , std::min( r_old->size() , r_new->size() ) );

  RecordBase::increment( r_new );
  RecordBase::decrement( r_old );

  return r_new->data();
}
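// Illustrative sketch (not part of this file): the *_tracked entry points pair
// each allocation with a SharedAllocationRecord so that reference counting and
// the debug record list work. A minimal round trip, assuming the ROCm backend
// has been initialized:
#if 0
void example_tracked_allocation()
{
  typedef SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > Record ;

  void * const p = Record::allocate_tracked( Kokkos::Experimental::ROCmSpace()
                                           , "example_buffer"
                                           , 256 * sizeof(double) );

  void * const q = Record::reallocate_tracked( p , 512 * sizeof(double) );

  Record::deallocate_tracked( q );
}
#endif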
|
||||
|
||||
#if 0
|
||||
void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
|
||||
allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
|
||||
, const std::string & arg_alloc_label
|
||||
, const size_t arg_alloc_size )
|
||||
{
|
||||
if ( ! arg_alloc_size ) return (void *) 0 ;
|
||||
|
||||
SharedAllocationRecord * const r =
|
||||
allocate( arg_space , arg_alloc_label , arg_alloc_size );
|
||||
|
||||
RecordBase::increment( r );
|
||||
|
||||
return r->data();
|
||||
}
|
||||
|
||||
void SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
|
||||
deallocate_tracked( void * const arg_alloc_ptr )
|
||||
{
|
||||
if ( arg_alloc_ptr != 0 ) {
|
||||
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
|
||||
|
||||
RecordBase::decrement( r );
|
||||
}
|
||||
}
|
||||
|
||||
void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
|
||||
reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size )
|
||||
{
|
||||
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
|
||||
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
|
||||
|
||||
Kokkos::Impl::DeepCopy<Experimental::ROCmHostPinnedSpace,Experimental::ROCmHostPinnedSpace>( r_new->data() , r_old->data()
|
||||
, std::min( r_old->size() , r_new->size() ) );
|
||||
|
||||
RecordBase::increment( r_new );
|
||||
RecordBase::decrement( r_old );
|
||||
|
||||
return r_new->data();
|
||||
}
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record( void * alloc_ptr )
|
||||
{
|
||||
using Header = SharedAllocationHeader ;
|
||||
using RecordBase = SharedAllocationRecord< void , void > ;
|
||||
using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
|
||||
|
||||
#if 0
|
||||
// Copy the header from the allocation
|
||||
Header head ;
|
||||
|
||||
Header const * const head_rocm = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
|
||||
|
||||
if ( alloc_ptr ) {
|
||||
Kokkos::Impl::DeepCopy<HostSpace,Experimental::ROCmSpace>( & head , head_rocm , sizeof(SharedAllocationHeader) );
|
||||
}
|
||||
|
||||
RecordROCm * const record = alloc_ptr ? static_cast< RecordROCm * >( head.m_record ) : (RecordROCm *) 0 ;
|
||||
|
||||
if ( ! alloc_ptr || record->m_alloc_ptr != head_rocm ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// Iterate the list to search for the record among all allocations
|
||||
// requires obtaining the root of the list and then locking the list.
|
||||
|
||||
RecordROCm * const record = static_cast< RecordROCm * >( RecordBase::find( & s_root_record , alloc_ptr ) );
|
||||
|
||||
if ( record == 0 ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return record ;
|
||||
}
|
||||
|
||||
#if 0
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record( void * alloc_ptr )
|
||||
{
|
||||
using Header = SharedAllocationHeader ;
|
||||
using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > ;
|
||||
|
||||
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
|
||||
|
||||
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
return static_cast< RecordROCm * >( h->m_record );
|
||||
}
|
||||
#endif
|
||||
|
||||
// Iterate records to print orphaned memory ...
|
||||
void
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
|
||||
print_records( std::ostream & s , const Kokkos::Experimental::ROCmSpace & space , bool detail )
|
||||
{
|
||||
SharedAllocationRecord< void , void > * r = & s_root_record ;
|
||||
|
||||
char buffer[256] ;
|
||||
|
||||
SharedAllocationHeader head ;
|
||||
|
||||
if ( detail ) {
|
||||
do {
|
||||
if ( r->m_alloc_ptr ) {
|
||||
Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
|
||||
}
|
||||
else {
|
||||
head.m_label[0] = 0 ;
|
||||
}
|
||||
|
||||
//Formatting dependent on sizeof(uintptr_t)
|
||||
const char * format_string;
|
||||
|
||||
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
|
||||
format_string = "ROCm addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
|
||||
}
|
||||
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
|
||||
format_string = "ROCm addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
|
||||
}
|
||||
|
||||
snprintf( buffer , 256
|
||||
, format_string
|
||||
, reinterpret_cast<uintptr_t>( r )
|
||||
, reinterpret_cast<uintptr_t>( r->m_prev )
|
||||
, reinterpret_cast<uintptr_t>( r->m_next )
|
||||
, reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
|
||||
, r->m_alloc_size
|
||||
, r->m_count
|
||||
, reinterpret_cast<uintptr_t>( r->m_dealloc )
|
||||
, head.m_label
|
||||
);
|
||||
std::cout << buffer ;
|
||||
r = r->m_next ;
|
||||
} while ( r != & s_root_record );
|
||||
}
|
||||
else {
|
||||
do {
|
||||
if ( r->m_alloc_ptr ) {
|
||||
|
||||
Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
|
||||
|
||||
//Formatting dependent on sizeof(uintptr_t)
|
||||
const char * format_string;
|
||||
|
||||
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
|
||||
format_string = "ROCm [ 0x%.12lx + %ld ] %s\n";
|
||||
}
|
||||
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
|
||||
format_string = "ROCm [ 0x%.12llx + %ld ] %s\n";
|
||||
}
|
||||
|
||||
snprintf( buffer , 256
|
||||
, format_string
|
||||
, reinterpret_cast< uintptr_t >( r->data() )
|
||||
, r->size()
|
||||
, head.m_label
|
||||
);
|
||||
}
|
||||
else {
|
||||
snprintf( buffer , 256 , "ROCm [ 0 + 0 ]\n" );
|
||||
}
|
||||
std::cout << buffer ;
|
||||
r = r->m_next ;
|
||||
} while ( r != & s_root_record );
|
||||
}
|
||||
}
|
||||
#if 0
|
||||
void
|
||||
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
|
||||
print_records( std::ostream & s , const Kokkos::Experimental::ROCmHostPinnedSpace & space , bool detail )
|
||||
{
|
||||
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "ROCmHostPinned" , & s_root_record , detail );
|
||||
}
|
||||
#endif
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
namespace Kokkos {
|
||||
namespace {
|
||||
#if 0
|
||||
KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_atomic() {
|
||||
unsigned i = tindex()*team_size() + lindex();
|
||||
|
||||
if(i<ROCM_SPACE_ATOMIC_MASK+1)
|
||||
kokkos_impl_rocm_lock_arrays.atomic[i] = 0;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_scratch_threadid(int N) {
|
||||
unsigned i = tindex()*team_size() + lindex();
|
||||
|
||||
if(i<N) {
|
||||
kokkos_impl_rocm_lock_arrays.scratch[i] = 0;
|
||||
kokkos_impl_rocm_lock_arrays.threadid[i] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace Impl {
|
||||
int* atomic_lock_array_rocm_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
rocmFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
rocmMalloc(&ptr,sizeof(int)*(ROCM_SPACE_ATOMIC_MASK+1));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int* scratch_lock_array_rocm_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
rocmFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
int* threadid_lock_array_rocm_space_ptr(bool deallocate) {
|
||||
static int* ptr = NULL;
|
||||
if(deallocate) {
|
||||
rocmFree(ptr);
|
||||
ptr = NULL;
|
||||
}
|
||||
|
||||
if(ptr==NULL && !deallocate)
|
||||
rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void init_lock_arrays_rocm_space() {
|
||||
static int is_initialized = 0;
|
||||
if(! is_initialized) {
|
||||
Kokkos::Impl::ROCmLockArraysStruct locks;
|
||||
locks.atomic = atomic_lock_array_rocm_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_rocm_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_rocm_space_ptr(false);
|
||||
am_copyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
|
||||
init_lock_array_kernel_atomic<<<(ROCM_SPACE_ATOMIC_MASK+255)/256,256>>>();
|
||||
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Experimental::ROCm::concurrency()+255)/256,256>>>(Kokkos::Experimental::ROCm::concurrency());
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void* rocm_resize_scratch_space(size_t bytes, bool force_shrink) {
  static void* ptr = NULL;
  static size_t current_size = 0;
  if(current_size == 0) {
    current_size = bytes;
    ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
  }
  if(bytes > current_size) {
    current_size = bytes;
    ptr = Kokkos::kokkos_realloc<Kokkos::Experimental::ROCmSpace>(ptr,current_size);
  }
  if((bytes < current_size) && (force_shrink)) {
    current_size = bytes;
    Kokkos::kokkos_free<Kokkos::Experimental::ROCmSpace>(ptr);
    ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
  }
  return ptr;
}
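// Behavior sketch (illustrative only): the scratch buffer grows on demand and
// shrinks only when force_shrink is set.
#if 0
inline void example_scratch_resize()
{
  void * a = rocm_resize_scratch_space( 4096 , false );  // first call: allocates 4096 bytes
  void * b = rocm_resize_scratch_space( 1024 , false );  // smaller request: keeps the 4096-byte buffer
  void * c = rocm_resize_scratch_space( 8192 , false );  // larger request: reallocates to 8192 bytes
  void * d = rocm_resize_scratch_space( 1024 , true  );  // force_shrink: frees and reallocates at 1024 bytes
  (void) a ; (void) b ; (void) c ; (void) d ;
}
#endif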
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif // KOKKOS_ENABLE_ROCM
174  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Task.cpp  Normal file
@ -0,0 +1,174 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template class TaskQueue< Kokkos::Experimental::ROCm > ;
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver
|
||||
( TaskQueueSpecialization< Kokkos::Experimental::ROCm >::queue_type * const queue,
|
||||
hc::tiled_index<3> threadIdx )
|
||||
{
|
||||
using Member = TaskExec< Kokkos::Experimental::ROCm > ;
|
||||
using Queue = TaskQueue< Kokkos::Experimental::ROCm > ;
|
||||
using task_root_type = TaskBase< void , void , void > ;
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member single_exec( 1, threadIdx );
|
||||
Member team_exec( threadIdx.tile_dim[0], threadIdx );
|
||||
|
||||
const int wavefront_lane = threadIdx.local[0] + threadIdx.local[1]* threadIdx.tile_dim[0] ;
|
||||
|
||||
union {
|
||||
task_root_type * ptr ;
|
||||
int raw[2] ;
|
||||
} task ;
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
do {
|
||||
|
||||
// Each team lead attempts to acquire either a thread team task
|
||||
// or collection of single thread tasks for the team.
|
||||
|
||||
if ( 0 == wavefront_lane ) {
|
||||
|
||||
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
|
||||
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
printf("TaskQueue<ROCm>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
||||
, uintptr_t(task.ptr));
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
// shuffle broadcast
|
||||
|
||||
task.raw[0] = hc::__shfl( task.raw[0] , 0 );
|
||||
task.raw[1] = hc::__shfl( task.raw[1] , 0 );
|
||||
|
||||
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
|
||||
|
||||
if ( end != task.ptr ) {
|
||||
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
|
||||
// Thread Team Task
|
||||
(*task.ptr->m_apply)( task.ptr , & team_exec );
|
||||
}
|
||||
else if ( 0 == threadIdx.local[1] ) {
|
||||
// Single Thread Task
|
||||
(*task.ptr->m_apply)( task.ptr , & single_exec );
|
||||
}
|
||||
|
||||
if ( 0 == wavefront_lane ) {
|
||||
queue->complete( task.ptr );
|
||||
}
|
||||
}
|
||||
} while(1);
|
||||
}
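// Note on the sentinel values used above: task.ptr == 0 means the queue
// reported no tasks in flight, so the wavefront leaves the do/while loop;
// task.ptr == end (the EndTag sentinel) means nothing was popped on this pass
// but work may still appear, so the loop spins again after the broadcast.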
|
||||
#if 0
|
||||
namespace {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void rocm_task_queue_execute( TaskQueue< Kokkos::Experimental::ROCm > * queue,
|
||||
hc::tiled_index<3> threadIdx )
|
||||
{ TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue, threadIdx ); }
|
||||
|
||||
}
|
||||
#endif
|
||||
void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::execute
|
||||
( TaskQueue< Kokkos::Experimental::ROCm > * const queue )
|
||||
{
|
||||
const int workgroups_per_wavefront = 4 ;
|
||||
const int wavefront_size = Kokkos::Impl::ROCmTraits::WavefrontSize ;
|
||||
const int cu_count = Kokkos::Impl::rocm_internal_cu_count();
|
||||
// const dim3 grid( Kokkos::Impl::rocm_internal_cu_count() , 1 , 1 );
|
||||
// const dim3 block( 1 , Kokkos::Impl::ROCmTraits::WorkGroupSize , workgroups_per_wavefront );
|
||||
|
||||
|
||||
|
||||
// Query the stack size, in bytes:
|
||||
// If not large enough then set the stack size, in bytes:
|
||||
|
||||
  // Adapted from the CUDA code. TODO: not at all sure that this is the proper
  // way to map the CUDA grid/blocks/3D tiling to HCC.
|
||||
#if 0
|
||||
hc::extent< 3 > flat_extent( cu_count,
|
||||
wavefront_size, workgroups_per_wavefront );
|
||||
hc::tiled_extent< 3 > team_extent = flat_extent.tile(1,
|
||||
wavefront_size,workgroups_per_wavefront);
|
||||
|
||||
hc::parallel_for_each( team_extent , [&](hc::tiled_index<3> idx) [[hc]]
|
||||
{
|
||||
TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue,idx );
|
||||
}).wait();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG ) */
458  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Task.hpp  Normal file
@ -0,0 +1,458 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_ROCM_TASK_HPP
|
||||
#define KOKKOS_IMPL_ROCM_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class > class TaskExec ;
|
||||
|
||||
template<>
|
||||
class TaskQueueSpecialization< Kokkos::Experimental::ROCm >
|
||||
{
|
||||
public:
|
||||
|
||||
using execution_space = Kokkos::Experimental::ROCm ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
using member_type = TaskExec< execution_space > ;
|
||||
|
||||
// Must specify memory space
|
||||
using memory_space = Kokkos::HostSpace ;
|
||||
|
||||
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void driver( queue_type * const, hc::tiled_index<3> );
|
||||
|
||||
// Must provide task queue execution function
|
||||
static void execute( queue_type * const );
|
||||
|
||||
// Must provide mechanism to set function pointer in
|
||||
// execution space from the host process.
|
||||
template< typename FunctorType >
|
||||
static
|
||||
void proc_set_apply( typename TaskBase< Kokkos::Experimental::ROCm
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
>::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< Kokkos::Experimental::ROCm
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> ;
|
||||
hc::extent< 1 > flat_extent( 1 );
|
||||
hc::tiled_extent< 1 > team_extent = flat_extent.tile( 1);
|
||||
|
||||
hc::parallel_for_each( team_extent , [&](hc::tiled_index<1> idx) [[hc]]
|
||||
{
|
||||
*ptr = TaskType::apply ;
|
||||
}).wait();
|
||||
}
|
||||
};
|
||||
|
||||
/*template<>
|
||||
KOKKOS_FUNCTION
|
||||
void TaskQueue<Kokkos::Experimental::ROCm>::decrement( typename TaskQueue<Kokkos::Experimental::ROCm>::task_root_type *
|
||||
) {}
|
||||
*/
|
||||
extern template class TaskQueue< Kokkos::Experimental::ROCm > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/**\brief Impl::TaskExec<ROCm> is the TaskScheduler<ROCm>::member_type
 *        passed to tasks running in a ROCm space.
 *
 *  ROCm thread blocks for tasking are dimensioned:
 *    idx.tile_dim[0] == vector length
 *    idx.tile_dim[1] == team size
 *    idx.tile_dim[2] == number of teams
 *  where
 *    idx.tile_dim[0] * idx.tile_dim[1] == WavefrontSize
 *
 *  Both single-thread and thread-team tasks are run by a full ROCm wavefront.
 *  A single-thread task is called by wavefront lane #0 and the remaining
 *  lanes of the wavefront are idle.
 */
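// Worked example of the dimensioning above (assuming WavefrontSize == 64):
//   idx.tile_dim[0] == 4   (vector length)
//   idx.tile_dim[1] == 16  (team size)   ->  4 * 16 == 64 == WavefrontSize
//   idx.tile_dim[2] == number of teams launched for task execution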
|
||||
template<>
|
||||
class TaskExec< Kokkos::Experimental::ROCm >
|
||||
{
|
||||
private:
|
||||
|
||||
TaskExec( TaskExec && ) = delete ;
|
||||
TaskExec( TaskExec const & ) = delete ;
|
||||
TaskExec & operator = ( TaskExec && ) = delete ;
|
||||
TaskExec & operator = ( TaskExec const & ) = delete ;
|
||||
|
||||
|
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::ROCm > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::ROCm > ;
|
||||
|
||||
int m_team_size ;
|
||||
hc::tiled_index<3> m_idx;
|
||||
|
||||
// KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size ) //TODO: tile_dim[0]
|
||||
// : m_team_size( arg_team_size ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size,
|
||||
hc::tiled_index<3> tidx)
|
||||
: m_team_size( arg_team_size),
|
||||
m_idx( tidx ) {}
|
||||
|
||||
public:
|
||||
// const auto local = t_idx.local[0];
|
||||
// const auto global = t_idx.global[0];
|
||||
// const auto tile = t_idx.tile[0];
|
||||
|
||||
hc::tiled_index<3> idx() const { return m_idx;}
|
||||
|
||||
#if defined( __HCC_ACCELERATOR__ )
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() { /* __threadfence_block(); */ }
|
||||
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_idx.local[1] ; } // t_idx.tile[0];
|
||||
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
|
||||
#else
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() {}
|
||||
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_size() const { return 0 ; }
|
||||
#endif
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
|
||||
TeamThreadRange
|
||||
( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType1, typename iType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::TaskExec< Kokkos::Experimental::ROCm > >
|
||||
TeamThreadRange
|
||||
( Impl:: TaskExec< Kokkos::Experimental::ROCm > & thread, const iType1 & begin, const iType2 & end )
|
||||
{
|
||||
typedef typename std::common_type<iType1, iType2>::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread, begin, end);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
|
||||
ThreadVectorRange
|
||||
( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
 *
 *  The range i=0..N-1 is mapped to all threads of the calling thread team.
 *  This functionality requires C++11 support.
 */
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
)
|
||||
{
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i);
|
||||
}
|
||||
}
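// Usage sketch (illustrative only): inside a task body, `member` is the
// TaskExec< Kokkos::Experimental::ROCm > handed to the task by the scheduler;
// x, y, a, and n are hypothetical task-local data.
#if 0
parallel_for( TeamThreadRange( member , n ) ,
              [&]( const int i ) { y[i] += a * x[i] ; } );
#endif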
|
||||
|
||||
// reduce across corresponding lanes between team members within workgroup
|
||||
// assume stride*team_size == workgroup_size
|
||||
template< typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void strided_shfl_workgroup_reduction
|
||||
(const ValueType& f(),
|
||||
ValueType& val,
|
||||
int team_size,
|
||||
int stride)
|
||||
{
|
||||
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
|
||||
f(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
|
||||
}
|
||||
}
|
||||
|
||||
template< typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void strided_shfl_workgroup_reduction
|
||||
(const JoinType& join,
|
||||
ValueType& val,
|
||||
int team_size,
|
||||
int stride)
|
||||
{
|
||||
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
|
||||
join(val, shfl_down(val, lane_delta, team_size*stride));
|
||||
}
|
||||
}
|
||||
|
||||
// multiple within-workgroup non-strided reductions
|
||||
template< typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void multi_shfl_workgroup_reduction
|
||||
(const JoinType& join,
|
||||
ValueType& val,
|
||||
int vec_length)
|
||||
{
|
||||
for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
|
||||
join(val, shfl_down(val, lane_delta, vec_length));
|
||||
}
|
||||
}
|
||||
|
||||
// broadcast within workgroup
|
||||
template< class ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType shfl_workgroup_broadcast
|
||||
(ValueType& val,
|
||||
int src_lane,
|
||||
int width)
|
||||
{
|
||||
return shfl(val, src_lane, width);
|
||||
}
|
||||
|
||||
// all-reduce across corresponding vector lanes between team members within workgroup
|
||||
// assume vec_length*team_size == workgroup_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
// threadIdx.y == member number
|
||||
|
||||
template<typename iType, class Lambda, typename ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
, ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
initialized_result = result;
|
||||
|
||||
strided_shfl_workgroup_reduction(
|
||||
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
|
||||
initialized_result,
|
||||
loop_boundaries.thread.team_size(),
|
||||
idx.tile_dim[0]);
|
||||
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
|
||||
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
strided_shfl_workgroup_reduction<ValueType, JoinType>(
|
||||
join,
|
||||
initialized_result,
|
||||
loop_boundaries.thread.team_size(),
|
||||
idx.tile_dim[0]);
|
||||
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
ValueType result = initialized_result;
|
||||
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
initialized_result = result;
|
||||
|
||||
//initialized_result = multi_shfl_workgroup_reduction(
|
||||
multi_shfl_workgroup_reduction(
|
||||
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
|
||||
initialized_result,
|
||||
idx.tile_dim[0]);
|
||||
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
|
||||
ValueType result = initialized_result;
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
initialized_result = result;
|
||||
|
||||
multi_shfl_workgroup_reduction<ValueType, JoinType>(join, initialized_result, idx.tile_dim[0]);
|
||||
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
|
||||
}
|
||||
|
||||
template< typename ValueType, typename iType, class Lambda >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, y, local_total;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
val = 0;
|
||||
lambda(i,val,false);
|
||||
|
||||
// intra-idx.tile_dim[0] exclusive scan on 'val'
|
||||
// accum = accumulated, sum in total for this iteration
|
||||
|
||||
// INCLUSIVE scan
|
||||
for( int offset = idx.tile_dim[0] ; offset < Impl::ROCmTraits::WavefrontSize ; offset <<= 1 ) {
|
||||
y = shfl_up(val, offset, Impl::ROCmTraits::WavefrontSize);
|
||||
if(idx.local[1]*idx.tile_dim[0] >= offset) { val += y; }
|
||||
}
|
||||
|
||||
// pass accum to all threads
|
||||
local_total = shfl_workgroup_broadcast<ValueType>(val,
|
||||
idx.local[0]+Impl::ROCmTraits::WavefrontSize-idx.tile_dim[0],
|
||||
Impl::ROCmTraits::WavefrontSize);
|
||||
|
||||
// make EXCLUSIVE scan by shifting values over one
|
||||
val = shfl_up(val, idx.tile_dim[0], Impl::ROCmTraits::WavefrontSize);
|
||||
if ( idx.local[1] == 0 ) { val = 0 ; }
|
||||
|
||||
val += accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
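// Worked example of the inclusive-to-exclusive conversion above: if three team
// members contribute val = 3, 1, 4 in one pass, the inclusive scan yields
// 3, 4, 8; shifting by one tile and zeroing the first member gives the
// exclusive results 0, 3, 4 handed back to the lambda, while local_total == 8
// is added to accum for the next pass.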
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, y, local_total;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
val = 0;
|
||||
lambda(i,val,false);
|
||||
|
||||
// intra-idx.tile_dim[0] exclusive scan on 'val'
|
||||
// accum = accumulated, sum in total for this iteration
|
||||
|
||||
// INCLUSIVE scan
|
||||
for( int offset = 1 ; offset < idx.tile_dim[0] ; offset <<= 1 ) {
|
||||
y = shfl_up(val, offset, idx.tile_dim[0]);
|
||||
if(idx.local[0] >= offset) { val += y; }
|
||||
}
|
||||
|
||||
// pass accum to all threads
|
||||
local_total = shfl_workgroup_broadcast<ValueType>(val, idx.tile_dim[0]-1,
|
||||
idx.tile_dim[0]);
|
||||
|
||||
// make EXCLUSIVE scan by shifting values over one
|
||||
val = shfl_up(val, 1, idx.tile_dim[0]);
|
||||
if ( idx.local[0] == 0 ) { val = 0 ; }
|
||||
|
||||
val += accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} /* namespace Kokkos */
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_ROCM_TASK_HPP */
518  lib/kokkos/core/src/ROCm/Kokkos_ROCm_Tile.hpp  Normal file
@ -0,0 +1,518 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <hc.hpp>
#include <type_traits>
#include <vector>
#include <memory>
#include <ROCm/Kokkos_ROCm_Config.hpp>

#if !defined( KOKKOS_ROCM_TILE_H )
#define KOKKOS_ROCM_TILE_H

// Macro to abstract out the enable_if craziness
#define KOKKOS_ROCM_REQUIRES(...) \
  bool KokkosROCmRequiresBool ## __LINE__ = true, typename std::enable_if<KokkosROCmRequiresBool ## __LINE__ && (__VA_ARGS__), int>::type = 0
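// Usage sketch (illustrative only, hypothetical function name): the macro is
// meant to appear as a trailing template parameter, e.g. to enable an overload
// only for arithmetic element types.
#if 0
template< class T , KOKKOS_ROCM_REQUIRES( std::is_arithmetic<T>::value ) >
void scale_in_place( T & x , T a ) [[hc]] [[cpu]]
{ x *= a ; }
#endif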
|
||||
|
||||
// This number uniquely identifies the 1.5 release build.
|
||||
#if __hcc_workweek__ > 17160
|
||||
#define ROCM15 1
|
||||
#endif
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<class T>
|
||||
|
||||
#if defined(ROCM15)
|
||||
using lds_t = T;
|
||||
#else
|
||||
// prior to 1.5, needed to decorate LDS addresses
|
||||
using lds_t = __attribute__((address_space(3))) T;
|
||||
#endif
|
||||
|
||||
#define KOKKOS_ROCM_TILE_RESTRIC_CPU restrict(cpu, amp)
|
||||
|
||||
// a set of routines to the replace the std::routines
|
||||
// that will operate on address space 3 types
|
||||
|
||||
#if defined(ROCM15)
|
||||
// 1.5 can't use std::copy et al for LDS access, so we define our own
|
||||
// set of routines
|
||||
template<class I, class O>
|
||||
void rcopy(I first, I last, O out) [[hc]]
|
||||
{
|
||||
while (first != last) *out++ = *first++;
|
||||
}
|
||||
template<class I,class F>
|
||||
void rfor_each(I first, I last, F f) [[hc]]
|
||||
{
|
||||
for(;first!=last;++first) f(*first);
|
||||
}
|
||||
|
||||
template<class I,class O,class F>
|
||||
void rtransform(I first, I last, O out, F f) [[hc]]
|
||||
{
|
||||
while(first!=last) *out++ = f(*first++);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
inline std::size_t get_max_tile_size() KOKKOS_ROCM_TILE_RESTRIC_CPU
{
  return hc::accelerator().get_max_tile_static_size() - 1024;
}

inline std::size_t get_max_tile_thread() KOKKOS_ROCM_TILE_RESTRIC_CPU
{
  return 64;
}

inline int next_pow_2(int x) restrict(cpu, amp)
{
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return x+1;
}
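// Worked example: next_pow_2(40) -> 64 , next_pow_2(64) -> 64 , next_pow_2(65) -> 128 .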
|
||||
|
||||
template<class T>
inline std::size_t get_tile_size(std::size_t n = 1,
                                 std::size_t team = 64,
                                 std::size_t vector = 1)
  KOKKOS_ROCM_TILE_RESTRIC_CPU
{
  const auto size = sizeof(T) * n;
  const auto group_size = get_max_tile_size();
  if (size == 0 || size > group_size) return 0;
  // Assume that thread size is a power of 2
  auto thread_size = std::min(team*vector,4*get_max_tile_thread());
  // ensure that we have enough tile static memory to keep
  // threadsize * size elements for reductions
  while(size > (group_size / thread_size) && thread_size > 2)
  { thread_size /= 2;
  }
  return thread_size;
}
|
||||
|
||||
template<class T>
|
||||
struct array_view
|
||||
{
|
||||
T* x;
|
||||
std::size_t n;
|
||||
|
||||
array_view(T* xp, std::size_t np) [[hc]] [[cpu]]
|
||||
: x(xp), n(np)
|
||||
{}
|
||||
|
||||
array_view(T* xp, T* yp) [[hc]] [[cpu]]
|
||||
: x(xp), n(yp-xp)
|
||||
{}
|
||||
|
||||
T& operator[](std::size_t i) const [[hc]] [[cpu]]
|
||||
{
|
||||
return x[i];
|
||||
}
|
||||
|
||||
std::size_t size() const [[hc]] [[cpu]]
|
||||
{
|
||||
return this->n;
|
||||
}
|
||||
|
||||
T* data() const [[hc]] [[cpu]]
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
T* begin() const [[hc]] [[cpu]]
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
T* end() const [[hc]] [[cpu]]
|
||||
{
|
||||
return x+this->size();
|
||||
}
|
||||
};
|
||||
|
||||
template<class T>
|
||||
struct rocm_char
|
||||
{ using type=char; };
|
||||
|
||||
template<class T>
|
||||
struct rocm_char<const T>
|
||||
: std::add_const<typename rocm_char<T>::type>
|
||||
{};
|
||||
#if !defined(ROCM15)
|
||||
// earlier compilers required explicit address space decorations
|
||||
template<class T>
|
||||
struct rocm_char<__attribute__((address_space(3))) T>
|
||||
{ using type = __attribute__((address_space(3))) typename rocm_char<T>::type; };
|
||||
|
||||
template<class T>
|
||||
struct rocm_char<const __attribute__((address_space(3))) T>
|
||||
{ using type = const __attribute__((address_space(3))) typename rocm_char<T>::type; };
|
||||
#endif
|
||||
|
||||
template<class T, class Char=typename rocm_char<T>::type>
|
||||
Char* rocm_byte_cast(T& x) restrict(cpu, amp)
|
||||
{
|
||||
return reinterpret_cast<Char*>(&x);
|
||||
}
|
||||
|
||||
template<class T, class U>
|
||||
void rocm_raw_assign(T& x, const U& y) restrict(cpu, amp)
|
||||
{
|
||||
auto * src = rocm_byte_cast(y);
|
||||
auto * dest = rocm_byte_cast(x);
|
||||
#if defined (ROCM15)
|
||||
rcopy(src, src+sizeof(T), dest);
|
||||
#else
|
||||
std::copy(src, src+sizeof(T), dest);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class T, class U>
|
||||
void rocm_assign_impl(T& x, const U& y, std::true_type) restrict(cpu, amp)
|
||||
{
|
||||
rocm_raw_assign(x, y);
|
||||
}
|
||||
|
||||
template<class T, class U>
|
||||
void rocm_assign_impl(T& x, const U& y, std::false_type) restrict(cpu, amp)
|
||||
{
|
||||
x = y;
|
||||
}
|
||||
|
||||
// Workaround for assigning in and out of LDS memory
|
||||
template<class T, class U>
|
||||
void rocm_assign(T& x, const U& y) restrict(cpu, amp)
|
||||
{
|
||||
rocm_assign_impl(x, y, std::integral_constant<bool, (
|
||||
sizeof(T) == sizeof(U)
|
||||
)>());
|
||||
}
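// Note: the byte-wise fallback in rocm_assign is only taken when the two types
// have the same size, which is exactly the case of assigning between the plain
// and LDS-qualified (address_space(3)) flavors of one type; mixed-size
// assignments such as double = float keep the usual conversion semantics of
// operator=.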
|
||||
|
||||
// Compute the address space of tile
|
||||
template<class T>
|
||||
struct tile_type
|
||||
{
|
||||
#if defined (ROCM15)
|
||||
typedef T type;
|
||||
#else
|
||||
typedef __attribute__((address_space(3))) T type;
|
||||
#endif
|
||||
};
|
||||
|
||||
#if !defined (ROCM15)
|
||||
template<class T, class Body>
|
||||
void lds_for(__attribute__((address_space(3))) T& value, Body b) [[hc]]
|
||||
{
|
||||
T state = value;
|
||||
b(state);
|
||||
value = state;
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class T, class Body>
|
||||
void lds_for(T& value, Body b) [[hc]]
|
||||
{
|
||||
b(value);
|
||||
}
|
||||
|
||||
|
||||
constexpr std::size_t get_max_tile_array_size()
|
||||
{
|
||||
return 24;
|
||||
}
|
||||
|
||||
template<class Derived, class T>
|
||||
struct single_action
|
||||
{
|
||||
template<class Action>
|
||||
void action_at(std::size_t i, Action a) [[hc]]
|
||||
{
|
||||
auto& value = static_cast<Derived&>(*this)[i];
|
||||
#if KOKKOS_ROCM_HAS_WORKAROUNDS
|
||||
T state = value;
|
||||
a(state);
|
||||
value = state;
|
||||
#else
|
||||
a(value);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Action>
|
||||
void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
|
||||
{
|
||||
static_cast<Derived&>(*this).action_at(i, [&](T& x)
|
||||
{
|
||||
static_cast<Derived&>(*this).action_at(j, [&](T& y)
|
||||
{
|
||||
a(x, y);
|
||||
});
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
template<class T>
|
||||
struct tile_buffer
|
||||
: array_view<typename tile_type<T>::type>, single_action<tile_buffer<T>, T>
|
||||
{
|
||||
typedef typename tile_type<T>::type element_type;
|
||||
typedef array_view<element_type> base;
|
||||
|
||||
using base::base;
|
||||
|
||||
tile_buffer(element_type* xp, std::size_t np, std::size_t) [[hc]] [[cpu]]
|
||||
: base(xp, np)
|
||||
{}
|
||||
|
||||
tile_buffer(T* xp, T* yp, std::size_t) [[hc]] [[cpu]]
|
||||
: base(xp, yp)
|
||||
{}
|
||||
};
|
||||
|
||||
template<class T>
|
||||
struct tile_buffer<T[]>
|
||||
{
|
||||
typedef typename tile_type<T>::type element_type;
|
||||
typedef typename tile_type<char>::type tchar_type;
|
||||
element_type* element_data;
|
||||
std::size_t n, m;
|
||||
|
||||
tile_buffer(element_type* xp, std::size_t np, std::size_t mp) [[hc]] [[cpu]]
|
||||
: element_data(xp), n(np), m(mp)
|
||||
{}
|
||||
|
||||
tile_buffer(element_type* xp, element_type* yp, std::size_t mp) [[hc]] [[cpu]]
|
||||
: element_data(xp), n(yp-xp), m(mp)
|
||||
{}
|
||||
|
||||
element_type* operator[](std::size_t i) const [[hc]] [[cpu]]
|
||||
{
|
||||
return element_data+i*m;
|
||||
}
|
||||
|
||||
template<class Action, class Q = T>
|
||||
typename Impl::enable_if< (sizeof(Q) <= 8) , void >::type
|
||||
action_at(std::size_t i, Action a) [[hc]]
|
||||
{
|
||||
element_type* value = (*this)[i];
|
||||
#if defined (ROCM15)
|
||||
a(value);
|
||||
#else
|
||||
#if KOKKOS_ROCM_HAS_WORKAROUNDS
|
||||
if (m > get_max_tile_array_size()) return;
|
||||
T state[get_max_tile_array_size()];
|
||||
// std::copy(value, value+m, state);
|
||||
// Workaround for assigning from LDS memory
|
||||
std::transform(value, value+m, state, [](element_type& x)
|
||||
{
|
||||
T result;
|
||||
rocm_assign(result, x);
|
||||
return result;
|
||||
});
|
||||
a(state);
|
||||
std::copy(state, state+m, value);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Action, class Q = T>
|
||||
typename Impl::enable_if< !(sizeof(Q) <= 8) , void >::type
|
||||
action_at(std::size_t i, Action a) [[hc]]
|
||||
{
|
||||
element_type* value = (*this)[i];
|
||||
#if defined (ROCM15)
|
||||
a(value);
|
||||
#else
|
||||
//#if KOKKOS_ROCM_HAS_WORKAROUNDS
|
||||
if (m > get_max_tile_array_size()) return;
|
||||
T state[get_max_tile_array_size()];
|
||||
// std::copy(value, value+m, state);
|
||||
// Workaround for assigning from LDS memory
|
||||
std::transform(value, value+m, state, [](element_type& x)
|
||||
{
|
||||
T result;
|
||||
rocm_assign(result, x);
|
||||
return result;
|
||||
});
|
||||
a(state);
|
||||
// this workaround required when T is greater than 8 bytes
|
||||
tile_static char tv[64*sizeof(T)];
|
||||
size_t sT = sizeof(T);
|
||||
for (int j = 0; j<sT; j++) tv[i*sT+j] = ((char *)state)[j];
|
||||
for (int j = 0; j<sT; j++) ((tchar_type *)value)[j] = tv[i*sT+j];
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class Action>
|
||||
void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
|
||||
{
|
||||
this->action_at(i, [&](T* x)
|
||||
{
|
||||
this->action_at(j, [&](T* y)
|
||||
{
|
||||
a(x, y);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
std::size_t size() const [[hc]] [[cpu]]
|
||||
{
|
||||
return this->n;
|
||||
}
|
||||
|
||||
element_type* data() const [[hc]] [[cpu]]
|
||||
{
|
||||
return element_data;
|
||||
}
|
||||
};
|
||||
|
||||
// Zero initialize LDS memory
|
||||
struct zero_init_f
|
||||
{
|
||||
template<class T>
|
||||
#if defined (ROCM15)
|
||||
void operator()(T& x, std::size_t=1) const [[hc]]
|
||||
{
|
||||
auto * start = reinterpret_cast<char*>(&x);
|
||||
for(int i=0; i<sizeof(T);i++) start[i] = 0;
|
||||
rocm_raw_assign(x, T());
|
||||
}
|
||||
#else
|
||||
void operator()(__attribute__((address_space(3))) T& x, std::size_t=1) const [[hc]]
|
||||
{
|
||||
auto * start = reinterpret_cast<__attribute__((address_space(3))) char*>(&x);
|
||||
std::fill(start, start+sizeof(T), 0);
|
||||
rocm_raw_assign(x, T());
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class T>
|
||||
#if defined (ROCM15)
|
||||
void operator()(T* x, std::size_t size) const [[hc]]
|
||||
{
|
||||
rfor_each(x, x+size, *this);
|
||||
}
|
||||
#else
|
||||
void operator()(__attribute__((address_space(3))) T* x, std::size_t size) const [[hc]]
|
||||
{
|
||||
std::for_each(x, x+size, *this);
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
static constexpr zero_init_f zero_init = {};
|
||||

struct tile_desc
{
  // Number of work items, or size of extent
  std::size_t elements;
  // number of threads in team
  std::size_t team_size;
  // vector length of team
  std::size_t vector_length;
  // Size of tile
  std::size_t tile_size;
  // Size of array
  std::size_t array_size;
  // Number of tiles
  std::size_t num_tiles;
  // Per team reserved LDS memory, used for reduction
  std::size_t reduce_size;
  // Per team shared memory in LDS; this is in addition to the reduce shared mem
  std::size_t shared_size;
  std::size_t size;
};

template<class T>
tile_desc get_tile_desc(std::size_t size,
                        std::size_t array_size=1,
                        std::size_t team_size=64,
                        std::size_t vector_size=1,
                        std::size_t shared_size=0)
{
  tile_desc result;
  result.elements = size;
  result.array_size = array_size;
  result.vector_length = vector_size;
  result.team_size = team_size;
  result.tile_size = get_tile_size<T>(array_size,team_size,vector_size);
  result.num_tiles = std::ceil(1.0 * size / result.tile_size);
  result.reduce_size = result.tile_size * sizeof(T) * array_size;
  result.shared_size = shared_size;
  result.size = result.tile_size * result.num_tiles;

  return result;
}
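// Worked example (illustrative only; it assumes get_tile_size<double> picks the
// default 64-thread tile): for 1000 double elements with array_size = 1,
//
//   auto td = get_tile_desc<double>(1000);
//   // td.tile_size   == 64
//   // td.num_tiles   == ceil(1000.0 / 64) == 16
//   // td.reduce_size == 64 * sizeof(double) * 1 == 512 bytes of LDS per team
//   // td.size        == 64 * 16 == 1024, i.e. the extent rounded up to whole tiles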

template<class U, class F, class T=typename std::remove_extent<U>::type>
hc::completion_future tile_for(tile_desc td, F f)
{
  assert(td.array_size <= get_max_tile_array_size() && "Exceed max array size");
  assert(((td.size % td.tile_size) == 0) && "Extent must be divisible by tile size");
  auto grid = hc::extent<1>(td.size).tile_with_dynamic(
      td.tile_size, td.reduce_size + td.shared_size);
  // grid.set_dynamic_group_segment_size(td.reduce_size + td.shared_size);
  return parallel_for_each(grid, [=](hc::tiled_index<1> t_idx) [[hc]]
  {
#if defined (ROCM15)
    typedef T group_t;
#else
    typedef __attribute__((address_space(3))) T group_t;
#endif
    group_t * buffer = (group_t *)hc::get_dynamic_group_segment_base_pointer();
    tile_buffer<U> tb(buffer, td.tile_size, td.array_size);
    zero_init(tb[t_idx.local[0]], td.array_size);
    f(t_idx, tb);
  });
}
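// A minimal usage sketch (illustrative only; the element type, lambda body, and
// scratch indexing are hypothetical and depend on the tile_buffer specialization):
//
//   auto td  = get_tile_desc<double>(n);
//   auto fut = tile_for<double>(td, [=](hc::tiled_index<1> idx, auto scratch) [[hc]]
//   {
//     // scratch[idx.local[0]] refers to this work item's zero-initialized LDS slot
//     // ... per-team work, e.g. a reduction staged through scratch ...
//   });
//   fut.wait();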

}}

#endif
346 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Vectorization.hpp Normal file
@ -0,0 +1,346 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCM_VECTORIZATION_HPP
#define KOKKOS_ROCM_VECTORIZATION_HPP

#include <Kokkos_Macros.hpp>

/* only compile this file if ROCM is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_ROCM

#include <Kokkos_ROCm.hpp>

namespace Kokkos {
using namespace hc;

// Shuffle only makes sense on >= Fiji GPUs; it doesn't work on CPUs
// or other GPUs. We provide a generic definition (which is trivial
// and doesn't do what it claims to do) because we don't actually use
// this function unless we are on a suitable GPU, with a suitable
// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow"
// internal parameter depends both on the ExecutionSpace and the Scalar type,
// and it controls whether shfl_down() gets called.)
namespace Impl {

template< typename Scalar >
struct shfl_union {
  enum {n = sizeof(Scalar)/4};
  float fval[n];
  KOKKOS_INLINE_FUNCTION
  Scalar value() {
    return *(Scalar*) fval;
  }
  KOKKOS_INLINE_FUNCTION
  void operator= (Scalar& value_) {
    float* const val_ptr = (float*) &value_;
    for(int i=0; i<n ; i++) {
      fval[i] = val_ptr[i];
    }
  }
  KOKKOS_INLINE_FUNCTION
  void operator= (const Scalar& value_) {
    float* const val_ptr = (float*) &value_;
    for(int i=0; i<n ; i++) {
      fval[i] = val_ptr[i];
    }
  }

};
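// A wider Scalar (for example a 16-byte complex<double>) is split into
// sizeof(Scalar)/4 floats, shuffled piecewise, and reassembled with value(),
// because the underlying shuffle primitives move 32-bit registers.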
}

#ifdef __HCC_ACCELERATOR__

KOKKOS_INLINE_FUNCTION
int __long2loint(const long val ) {
  union {
    long l;
    int i[2];
  } u;
  u.l = val;
  return u.i[0];
}

KOKKOS_INLINE_FUNCTION
int __long2hiint(const long val ) {
  union {
    long l;
    int i[2];
  } u;
  u.l = val;
  return u.i[1];
}

KOKKOS_INLINE_FUNCTION
int __double2loint(const double val ) {
  union {
    double d;
    int i[2];
  } u;
  u.d = val;
  return u.i[0];
}

KOKKOS_INLINE_FUNCTION
int __double2hiint(const double val ) {
  union {
    double d;
    int i[2];
  } u;
  u.d = val;
  return u.i[1];
}

KOKKOS_INLINE_FUNCTION
long __hiloint2long(const int hi, const int lo ) {
  union {
    long l;
    int i[2];
  } u;
  u.i[0] = lo;
  u.i[1] = hi;
  return u.l;
}

KOKKOS_INLINE_FUNCTION
double __hiloint2double(const int hi, const int lo ) {
  union {
    double d;
    int i[2];
  } u;
  u.i[0] = lo;
  u.i[1] = hi;
  return u.d;
}

KOKKOS_INLINE_FUNCTION
int shfl(const int &val, const int& srcLane, const int& width ) {
  return __shfl(val,srcLane,width);
}

KOKKOS_INLINE_FUNCTION
float shfl(const float &val, const int& srcLane, const int& width ) {
  return __shfl(val,srcLane,width);
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width ) {
  Scalar tmp1 = val;
  float tmp = *reinterpret_cast<float*>(&tmp1);
  tmp = __shfl(tmp,srcLane,width);
  return *reinterpret_cast<Scalar*>(&tmp);
}

KOKKOS_INLINE_FUNCTION
double shfl(const double &val, const int& srcLane, const int& width) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl(lo,srcLane,width);
  hi = __shfl(hi,srcLane,width);
  return __hiloint2double(hi,lo);
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
  int lo = __double2loint(*reinterpret_cast<const double*>(&val));
  int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
  lo = __shfl(lo,srcLane,width);
  hi = __shfl(hi,srcLane,width);
  const double tmp = __hiloint2double(hi,lo);
  return *(reinterpret_cast<const Scalar*>(&tmp));
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
  Impl::shfl_union<Scalar> s_val;
  Impl::shfl_union<Scalar> r_val;
  s_val = val;

  for(int i = 0; i<s_val.n; i++)
    r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
  return r_val.value();
}

KOKKOS_INLINE_FUNCTION
int shfl_down(const int &val, const int& delta, const int& width) {
  return __shfl_down(val,delta,width);
}

KOKKOS_INLINE_FUNCTION
float shfl_down(const float &val, const int& delta, const int& width) {
  return __shfl_down(val,delta,width);
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
  Scalar tmp1 = val;
  float tmp = *reinterpret_cast<float*>(&tmp1);
  tmp = __shfl_down(tmp,delta,width);
  return *reinterpret_cast<Scalar*>(&tmp);
}

KOKKOS_INLINE_FUNCTION
long shfl_down(const long &val, const int& delta, const int& width) {
  int lo = __long2loint(val);
  int hi = __long2hiint(val);
  lo = __shfl_down(lo,delta,width);
  hi = __shfl_down(hi,delta,width);
  return __hiloint2long(hi,lo);
}

KOKKOS_INLINE_FUNCTION
double shfl_down(const double &val, const int& delta, const int& width) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down(lo,delta,width);
  hi = __shfl_down(hi,delta,width);
  return __hiloint2double(hi,lo);
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
  int lo = __double2loint(*reinterpret_cast<const double*>(&val));
  int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
  lo = __shfl_down(lo,delta,width);
  hi = __shfl_down(hi,delta,width);
  const double tmp = __hiloint2double(hi,lo);
  return *(reinterpret_cast<const Scalar*>(&tmp));
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
  Impl::shfl_union<Scalar> s_val;
  Impl::shfl_union<Scalar> r_val;
  s_val = val;

  for(int i = 0; i<s_val.n; i++)
    r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
  return r_val.value();
}
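// Typical use (illustrative only): a lane-level tree reduction across a
// wavefront, assuming `width` is the sub-group size and `partial` holds each
// lane's contribution.
//
//   double partial = my_contribution;
//   for (int offset = width / 2; offset > 0; offset >>= 1)
//     partial += shfl_down(partial, offset, width);
//   // lane 0 of every group of `width` lanes now holds that group's sum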

KOKKOS_INLINE_FUNCTION
int shfl_up(const int &val, const int& delta, const int& width ) {
  return __shfl_up(val,delta,width);
}

KOKKOS_INLINE_FUNCTION
float shfl_up(const float &val, const int& delta, const int& width ) {
  return __shfl_up(val,delta,width);
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
  Scalar tmp1 = val;
  float tmp = *reinterpret_cast<float*>(&tmp1);
  tmp = __shfl_up(tmp,delta,width);
  return *reinterpret_cast<Scalar*>(&tmp);
}

KOKKOS_INLINE_FUNCTION
double shfl_up(const double &val, const int& delta, const int& width ) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_up(lo,delta,width);
  hi = __shfl_up(hi,delta,width);
  return __hiloint2double(hi,lo);
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
  int lo = __double2loint(*reinterpret_cast<const double*>(&val));
  int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
  lo = __shfl_up(lo,delta,width);
  hi = __shfl_up(hi,delta,width);
  const double tmp = __hiloint2double(hi,lo);
  return *(reinterpret_cast<const Scalar*>(&tmp));
}

template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
  Impl::shfl_union<Scalar> s_val;
  Impl::shfl_union<Scalar> r_val;
  s_val = val;

  for(int i = 0; i<s_val.n; i++)
    r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
  return r_val.value();
}

#else
template<typename Scalar>
inline
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
  if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<8.0.");
  return val;
}

template<typename Scalar>
inline
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
  if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<8.0.");
  return val;
}

template<typename Scalar>
inline
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
  if(width > 1) Kokkos::abort("Error: calling shfl_up from a device with CC<8.0.");
  return val;
}
#endif

}

#endif // KOKKOS_ENABLE_ROCM
#endif
367 lib/kokkos/core/src/ROCm/hc_math_std.hpp Normal file
@ -0,0 +1,367 @@
#pragma once

#include "hc.hpp"
#include <cmath>

// Math functions with integer overloads will be converted to
// this floating point type.
#define HC_IMPLICIT_FLOAT_CONV double
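// Consequence of the promotion (illustrative): an integral argument selects the
// HC_IMPLICIT_FLOAT_CONV overload of the wrappers defined below, so
//
//   auto r = sqrt(4);      // r is double (2.0), not int
//   auto s = sqrt(2.0f);   // s is float; floating-point arguments keep their own type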

#ifdef __KALMAR_ACCELERATOR__

#define HC_MATH_WRAPPER_1(function, arg1) \
template<typename T> \
inline T function(T arg1) __attribute__((hc,cpu)) { \
  return hc::precise_math::function(arg1); \
}
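// For reference, HC_MATH_WRAPPER_1(sqrtf, x) expands to a forwarding template:
//
//   template<typename T>
//   inline T sqrtf(T x) __attribute__((hc,cpu)) {
//     return hc::precise_math::sqrtf(x);
//   }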
|
||||
#define KALMAR_MATH_WRAPPER_1(function, arg1) HC_MATH_WRAPPER_1(function, arg1)
|
||||
|
||||
#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
|
||||
template<typename T> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
|
||||
function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
|
||||
} \
|
||||
template<typename T> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
|
||||
function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1); \
|
||||
}
|
||||
|
||||
#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1)
|
||||
|
||||
#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
|
||||
template<typename T> \
|
||||
inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
|
||||
template<typename T> \
|
||||
inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
|
||||
template<typename T> \
|
||||
inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_TQ(function, arg1) \
|
||||
template<typename T, typename Q> \
|
||||
inline T function(Q arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
|
||||
template<typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_integral<Q>::value,T>::type \
|
||||
function(Q arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
|
||||
}\
|
||||
template<typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
|
||||
function(Q arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
|
||||
template<typename T, typename Q> \
|
||||
inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
|
||||
template<typename T, typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
|
||||
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
|
||||
}\
|
||||
template<typename T, typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
|
||||
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1,arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
|
||||
template<typename T, typename Q> \
|
||||
inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
|
||||
template<typename T, typename Q> \
|
||||
inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
|
||||
hc::precise_math::function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define HC_MATH_WRAPPER_1(function, arg1) \
|
||||
template<typename T> \
|
||||
inline T function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1); \
|
||||
}
|
||||
|
||||
#define KALMAR_MATH_WRAPPER_1(function, arg1) \
|
||||
template<typename T> \
|
||||
inline T function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
|
||||
template<typename T> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
|
||||
function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return ::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
|
||||
} \
|
||||
template<typename T> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
|
||||
function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1); \
|
||||
}
|
||||
|
||||
#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
|
||||
template<typename T> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
|
||||
function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
|
||||
} \
|
||||
template<typename T> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
|
||||
function(T arg1) __attribute__((hc,cpu)) { \
|
||||
return hc::precise_math::function(arg1); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
|
||||
template<typename T> \
|
||||
inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
|
||||
template<typename T> \
|
||||
inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
|
||||
template<typename T> \
|
||||
inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_TQ(function, arg1) \
|
||||
template<typename T, typename Q> \
|
||||
inline T function(Q arg1) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
|
||||
template<typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_integral<Q>::value,T>::type \
|
||||
function(Q arg1) __attribute__((hc)) { \
|
||||
return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
|
||||
}\
|
||||
template<typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
|
||||
function(Q arg1) __attribute__((hc)) { \
|
||||
return std::function(arg1); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
|
||||
template<typename T, typename Q> \
|
||||
inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1, arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
|
||||
template<typename T, typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
|
||||
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
|
||||
return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
|
||||
}\
|
||||
template<typename T, typename Q> \
|
||||
inline \
|
||||
typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
|
||||
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1,arg2); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
|
||||
template<typename T, typename Q> \
|
||||
inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
|
||||
return std::function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
|
||||
template<typename T, typename Q> \
|
||||
inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
|
||||
std::function(arg1, arg2, arg3); \
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// override global math functions
|
||||
namespace std {
|
||||
|
||||
// following math functions are NOT available because they don't have a GPU implementation
|
||||
//
|
||||
// erfinv
|
||||
// erfcinv
|
||||
// fpclassify
|
||||
//
|
||||
// following math functions are NOT available because they don't have a CPU implementation
|
||||
//
|
||||
// cospif
|
||||
// cospi
|
||||
// rsqrtf
|
||||
// rsqrt
|
||||
// sinpif
|
||||
// sinpi
|
||||
// tanpi
|
||||
//
|
||||
|
||||
HC_MATH_WRAPPER_TQ(ilogbf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(ilogb, int, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isfinite, bool, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isinf, bool, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnan, bool, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnormal, bool, x)
|
||||
HC_MATH_WRAPPER_TQ(nanf, tagp)
|
||||
HC_MATH_WRAPPER_TQ(nan, tagp)
|
||||
//HC_MATH_WRAPPER_TQ(signbitf, x)
|
||||
HC_MATH_WRAPPER_TQ(signbit, x)
|
||||
HC_MATH_WRAPPER_TTQ(frexpf, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(frexp, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(ldexpf, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(ldexp, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(lgammaf, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(lgamma, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(modff, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(modf, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(scalbnf, x, exp)
|
||||
HC_MATH_WRAPPER_TTQ(scalbn, x, exp)
|
||||
HC_MATH_WRAPPER_TTTQ(remquof, x, y, quo)
|
||||
HC_MATH_WRAPPER_TTTQ(remquo, x, y, quo)
|
||||
HC_MATH_WRAPPER_VTQQ(sincosf, x, s, c)
|
||||
HC_MATH_WRAPPER_VTQQ(sincos, x, s, c)
|
||||
|
||||
HC_MATH_WRAPPER_1(acosf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(acos, x)
|
||||
HC_MATH_WRAPPER_1(acoshf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(acosh, x)
|
||||
HC_MATH_WRAPPER_1(asinf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(asin, x)
|
||||
HC_MATH_WRAPPER_1(asinhf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(asinh, x)
|
||||
HC_MATH_WRAPPER_1(atanf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(atan, x)
|
||||
HC_MATH_WRAPPER_1(atanhf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(atanh, x)
|
||||
HC_MATH_WRAPPER_2(atan2f, x, y)
|
||||
HC_MATH_WRAPPER_2(atan2, x, y)
|
||||
HC_MATH_WRAPPER_1(cbrtf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(cbrt, x)
|
||||
HC_MATH_WRAPPER_1(ceilf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(ceil, x)
|
||||
HC_MATH_WRAPPER_2(copysignf, x, y)
|
||||
HC_MATH_WRAPPER_2(copysign, x, y)
|
||||
HC_MATH_WRAPPER_1(cosf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(cos, x)
|
||||
HC_MATH_WRAPPER_1(coshf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(cosh, x)
|
||||
KALMAR_MATH_WRAPPER_1(cospif, x)
|
||||
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(cospi, x)
|
||||
HC_MATH_WRAPPER_1(erff, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(erf, x)
|
||||
HC_MATH_WRAPPER_1(erfcf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(erfc, x)
|
||||
HC_MATH_WRAPPER_1(expf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp, x)
|
||||
HC_MATH_WRAPPER_1(exp2f, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp2, x)
|
||||
HC_MATH_WRAPPER_1(exp10f, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp10, x)
|
||||
HC_MATH_WRAPPER_1(expm1f, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(expm1, x)
|
||||
HC_MATH_WRAPPER_1(fabsf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(fabs, x)
|
||||
HC_MATH_WRAPPER_2(fdimf, x, y)
|
||||
HC_MATH_WRAPPER_2(fdim, x, y)
|
||||
HC_MATH_WRAPPER_1(floorf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(floor, x)
|
||||
HC_MATH_WRAPPER_3(fmaf, x, y, z)
|
||||
HC_MATH_WRAPPER_3(fma, x, y, z)
|
||||
HC_MATH_WRAPPER_2(fmaxf, x, y)
|
||||
HC_MATH_WRAPPER_2(fmax, x, y)
|
||||
HC_MATH_WRAPPER_2(fminf, x, y)
|
||||
HC_MATH_WRAPPER_2(fmin, x, y)
|
||||
HC_MATH_WRAPPER_2(fmodf, x, y)
|
||||
HC_MATH_WRAPPER_2(fmod, x, y)
|
||||
HC_MATH_WRAPPER_2(hypotf, x, y)
|
||||
HC_MATH_WRAPPER_2(hypot, x, y)
|
||||
HC_MATH_WRAPPER_1(logf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(log, x)
|
||||
HC_MATH_WRAPPER_1(log10f, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(log10, x)
|
||||
HC_MATH_WRAPPER_1(log2f, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(log2, x)
|
||||
HC_MATH_WRAPPER_1(log1pf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(log1p, x)
|
||||
HC_MATH_WRAPPER_1(logbf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(logb, x)
|
||||
HC_MATH_WRAPPER_1(nearbyintf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(nearbyint, x)
|
||||
HC_MATH_WRAPPER_2(nextafterf, x, y)
|
||||
HC_MATH_WRAPPER_2(nextafter, x, y)
|
||||
HC_MATH_WRAPPER_2(powf, x, y)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(pow,x,y)
|
||||
//HC_MATH_WRAPPER_1(rcbrtf, x)
|
||||
//HC_MATH_WRAPPER_1(rcbrt, x)
|
||||
HC_MATH_WRAPPER_2(remainderf, x, y)
|
||||
HC_MATH_WRAPPER_2(remainder, x, y)
|
||||
HC_MATH_WRAPPER_1(roundf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(round, x)
|
||||
KALMAR_MATH_WRAPPER_1(rsqrtf, x)
|
||||
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(rsqrt, x)
|
||||
HC_MATH_WRAPPER_2(scalbf, x, exp)
|
||||
HC_MATH_WRAPPER_2(scalb, x, exp)
|
||||
HC_MATH_WRAPPER_1(sinf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(sin, x)
|
||||
HC_MATH_WRAPPER_1(sinhf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(sinh, x)
|
||||
KALMAR_MATH_WRAPPER_1(sinpif, x)
|
||||
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(sinpi, x)
|
||||
HC_MATH_WRAPPER_1(sqrtf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(sqrt, x)
|
||||
HC_MATH_WRAPPER_1(tgammaf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(tgamma, x)
|
||||
HC_MATH_WRAPPER_1(tanf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(tan, x)
|
||||
HC_MATH_WRAPPER_1(tanhf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(tanh, x)
|
||||
HC_MATH_WRAPPER_1(truncf, x)
|
||||
HC_MATH_WRAPPER_FP_OVERLOAD_1(trunc, x)
|
||||
|
||||
//HC_MATH_ALIAS_2(min, fmin, x, y)
|
||||
//HC_MATH_ALIAS_2(max, fmax, x, y)
|
||||
|
||||
} // namespace
|
||||
|
||||
@ -125,6 +125,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
|
||||
//----------------------------------------------------------------------------
|
||||
// GCC native CAS supports int, long, unsigned int, unsigned long.
|
||||
// Intel native CAS support int and long with the same interface as GCC.
|
||||
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
|
||||
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
|
||||
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
|
||||
|
||||
@ -280,6 +281,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // !defined ROCM_ATOMICS
|
||||
|
||||
template <typename T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
||||
@ -158,6 +158,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
#endif
|
||||
#endif
|
||||
//----------------------------------------------------------------------------
|
||||
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
|
||||
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
|
||||
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
|
||||
|
||||
@ -355,6 +356,7 @@ T atomic_fetch_add( volatile T * const dest , const T val )
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // !defined ROCM_ATOMICS
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Simpler version of atomic_fetch_add without the fetch
|
||||
|
||||
@ -135,6 +135,7 @@ T atomic_fetch_sub( volatile T * const dest ,
|
||||
#endif
|
||||
#endif
|
||||
//----------------------------------------------------------------------------
|
||||
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
|
||||
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
|
||||
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
|
||||
|
||||
@ -263,6 +264,8 @@ T atomic_fetch_sub( volatile T * const dest , const T val )
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif // !defined ROCM_ATOMICS
|
||||
|
||||
// Simpler version of atomic_fetch_sub without the fetch
|
||||
template <typename T>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
|
||||
@ -238,7 +238,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
|
||||
*dest = Oper::apply(return_val, val);
|
||||
Impl::unlock_address_host_space( (void*) dest );
|
||||
return return_val;
|
||||
#else
|
||||
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
|
||||
// This is a way to (hopefully) avoid dead lock in a warp
|
||||
T return_val;
|
||||
int done = 0;
|
||||
@ -277,7 +277,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
|
||||
*dest = return_val;
|
||||
Impl::unlock_address_host_space( (void*) dest );
|
||||
return return_val;
|
||||
#else
|
||||
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
|
||||
T return_val;
|
||||
// This is a way to (hopefully) avoid dead lock in a warp
|
||||
int done = 0;
|
||||
|
||||
@ -62,6 +62,8 @@ int bit_first_zero( unsigned i ) noexcept
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return full != i ? __ffs( ~i ) - 1 : -1 ;
|
||||
#elif defined( __HCC_ACCELERATOR__ )
|
||||
return full != i ? (int)hc::__firstbit_u32_u32(~i) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
return full != i ? _bit_scan_forward( ~i ) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
@ -82,6 +84,8 @@ int bit_scan_forward( unsigned i )
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return __ffs(i) - 1;
|
||||
#elif defined( __HCC_ACCELERATOR__ )
|
||||
return (int)hc::__firstbit_u32_u32(i);
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
return _bit_scan_forward(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
@ -106,6 +110,8 @@ int bit_scan_reverse( unsigned i )
|
||||
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return shift - __clz(i);
|
||||
#elif defined( __HCC_ACCELERATOR__ )
|
||||
return (int)hc::__firstbit_u32_u32(i);
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
return _bit_scan_reverse(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
@ -130,6 +136,8 @@ int bit_count( unsigned i )
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return __popc(i);
|
||||
#elif defined( __HCC_ACCELERATOR__ )
|
||||
return (int)hc::__popcount_u32_b32(i);
|
||||
#elif defined ( __INTEL_COMPILER )
|
||||
return _popcnt32(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
|
||||
@ -72,6 +72,10 @@ uint64_t clock_tic(void) noexcept
|
||||
|
||||
return clock64();
|
||||
|
||||
#elif defined(__HCC_ACCELERATOR__)
|
||||
// Get clock register
|
||||
return hc::__clock_u64();
|
||||
|
||||
#elif defined( __i386__ ) || defined( __x86_64 )
|
||||
|
||||
// Return value of 64-bit hi-res clock register.
|
||||
|
||||
@ -80,7 +80,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
const int num_threads = args.num_threads;
|
||||
const int use_numa = args.num_numa;
|
||||
#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS )
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
#if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
|
||||
const int use_gpu = args.device_id;
|
||||
#endif // defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
@ -162,6 +162,18 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_ROCM )
|
||||
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
|
||||
if (use_gpu > -1) {
|
||||
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( use_gpu ) );
|
||||
}
|
||||
else {
|
||||
Kokkos::Experimental::ROCm::initialize();
|
||||
}
|
||||
std::cout << "Kokkos::initialize() fyi: ROCm enabled and initialized" << std::endl ;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
@ -181,6 +193,13 @@ void finalize_internal( const bool all_spaces = false )
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_ROCM )
|
||||
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
|
||||
if(Kokkos::Experimental::ROCm::is_initialized())
|
||||
Kokkos::Experimental::ROCm::finalize();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
if( std::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
|
||||
if(Kokkos::Experimental::OpenMPTarget::is_initialized())
|
||||
@ -225,6 +244,12 @@ void fence_internal()
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_ROCM )
|
||||
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value ) {
|
||||
Kokkos::Experimental::ROCm::fence();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
|
||||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
|
||||
|
||||
@ -75,7 +75,7 @@ void abort( const char * const message ) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
Kokkos::Impl::cuda_abort(message);
|
||||
#else
|
||||
#ifndef KOKKOS_ENABLE_OPENMPTARGET
|
||||
#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(__HCC_ACCELERATOR__)
|
||||
Kokkos::Impl::host_abort(message);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -275,7 +275,9 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
|
||||
for ( int i = 0 ; i < end ; ++i ) {
|
||||
((int8_t*) & value )[i] = int8_t( step );
|
||||
}
|
||||
|
||||
// Do not REMOVE this store fence!!!
|
||||
// Makes stuff hang on GCC with more than 8 threads
|
||||
store_fence();
|
||||
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
|
||||
, value );
|
||||
}
|
||||
|
||||
@ -53,6 +53,8 @@ void memory_fence()
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
__threadfence();
|
||||
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
|
||||
amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
|
||||
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
|
||||
asm volatile (
|
||||
"mfence" ::: "memory"
|
||||
|
||||
@ -140,12 +140,6 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
|
||||
#ifndef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
||||
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX1Z
|
||||
#ifndef KOKKOS_ENABLE_CXX1Z
|
||||
#define KOKKOS_ENABLE_CXX1Z KOKKOS_HAVE_CXX1Z
|
||||
|
||||
@ -456,10 +456,11 @@ void TaskQueue< ExecSpace >::schedule_aggregate
|
||||
// task->m_next == member of linked list (queue)
|
||||
|
||||
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
|
||||
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
|
||||
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d %d }\n"
|
||||
, uintptr_t(task)
|
||||
, uintptr_t(task->m_wait)
|
||||
, uintptr_t(task->m_next)
|
||||
, task->m_dep_count
|
||||
, task->m_task_type
|
||||
, task->m_priority
|
||||
, task->m_ref_count );
|
||||
@ -597,7 +598,6 @@ void TaskQueue< ExecSpace >::complete
|
||||
, task->m_task_type
|
||||
, task->m_priority
|
||||
, task->m_ref_count );
|
||||
fflush( stdout );
|
||||
#endif
|
||||
|
||||
task_root_type volatile & t = *task ;
|
||||
|
||||
@ -1015,8 +1015,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
|
||||
constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
|
||||
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
|
||||
{
|
||||
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
|
||||
, "ViewOffset LayoutLeft and LayoutRight are only compatible when rank == 1" );
|
||||
static_assert(
|
||||
( DimRHS::rank == 0 &&
|
||||
dimension_type::rank == 0 ) ||
|
||||
( DimRHS::rank == 1 &&
|
||||
dimension_type::rank == 1 &&
|
||||
dimension_type::rank_dynamic == 1 )
|
||||
, "ViewOffset LayoutLeft and LayoutRight are only compatible when rank <= 1" );
|
||||
}
|
||||
|
||||
template< class DimRHS >
|
||||
@ -1024,8 +1029,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
|
||||
ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
|
||||
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
|
||||
{
|
||||
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
|
||||
, "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" );
|
||||
static_assert(
|
||||
( DimRHS::rank == 0 &&
|
||||
dimension_type::rank == 0 ) ||
|
||||
( DimRHS::rank == 1 &&
|
||||
dimension_type::rank == 1 &&
|
||||
dimension_type::rank_dynamic == 1 )
|
||||
, "ViewOffset LayoutLeft and LayoutStride are only compatible when rank <= 1" );
|
||||
if ( rhs.m_stride.S0 != 1 ) {
|
||||
Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft from LayoutStride requires stride == 1" );
|
||||
}
|
||||
@ -1493,8 +1503,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutRight
|
||||
constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
|
||||
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
|
||||
{
|
||||
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
|
||||
, "ViewOffset LayoutRight and LayoutLeft are only compatible when rank == 1" );
|
||||
static_assert(
|
||||
( DimRHS::rank == 0 &&
|
||||
dimension_type::rank == 0 ) ||
|
||||
( DimRHS::rank == 1 &&
|
||||
dimension_type::rank == 1 &&
|
||||
dimension_type::rank_dynamic == 1 )
|
||||
, "ViewOffset LayoutRight and LayoutLeft are only compatible when rank <= 1" );
|
||||
}
|
||||
|
||||
template< class DimRHS >
|
||||
@ -1502,8 +1517,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutRight
|
||||
ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
|
||||
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
|
||||
{
|
||||
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
|
||||
, "ViewOffset LayoutLeft/Right and LayoutStride are only compatible when rank == 1" );
|
||||
static_assert(
|
||||
( DimRHS::rank == 0 &&
|
||||
dimension_type::rank == 0 ) ||
|
||||
( DimRHS::rank == 1 &&
|
||||
dimension_type::rank == 1 &&
|
||||
dimension_type::rank_dynamic == 1 )
|
||||
, "ViewOffset LayoutRight and LayoutString are only compatible when rank <= 1" );
|
||||
if ( rhs.m_stride.S0 != 1 ) {
|
||||
Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft/Right from LayoutStride requires stride == 1" );
|
||||
}
|
||||
|
||||
@ -143,7 +143,7 @@ public:
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
~ViewOffset() = default ;
|
||||
KOKKOS_FUNCTION_DEFAULTED ~ViewOffset() = default ;
|
||||
KOKKOS_INLINE_FUNCTION ViewOffset() = default ;
|
||||
KOKKOS_INLINE_FUNCTION ViewOffset( const ViewOffset & ) = default ;
|
||||
KOKKOS_INLINE_FUNCTION ViewOffset & operator = ( const ViewOffset & ) = default ;
|
||||
|
||||