Update Kokkos library to v2.04.00

Stan Moore
2017-08-22 13:42:02 -06:00
parent b11fe2eddb
commit 090c792d90
112 changed files with 10885 additions and 154 deletions

View File

@@ -9,30 +9,6 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)
# Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA governs whether Kokkos allows
# use of lambdas at the outer level of parallel dispatch (that is, as
# the argument to an outer parallel_for, parallel_reduce, or
# parallel_scan). This works with non-CUDA execution spaces if C++11
# is enabled. It does not currently work with public releases of
# CUDA. If that changes, please change the default here to ON if CUDA
# and C++11 are ON.
IF (${PROJECT_NAME}_ENABLE_CXX11)
IF (${PACKAGE_NAME}_ENABLE_CUDA)
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT ON)
ENDIF ()
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ENDIF ()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA
KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
"Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan). This requires C++11. It also does not currently work with public releases of CUDA. As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled. If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined. For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
${Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT}
)
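For orientation, a minimal sketch of what this option gated, assuming a view-based saxpy body whose names (x, y, a, n) are illustrative only:
#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
// A lambda at the outer level of parallel dispatch, legal only when the macro is defined.
Kokkos::parallel_for( n , KOKKOS_LAMBDA ( const int i ) { y(i) += a * x(i); } );
#endif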
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})

View File

@@ -152,10 +152,10 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return pointer(0) ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return const_pointer(0); }
~Array() = default ;
Array() = default ;
Array( const Array & ) = default ;
Array & operator = ( const Array & ) = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED Array() = default ;
KOKKOS_FUNCTION_DEFAULTED Array( const Array & ) = default ;
KOKKOS_FUNCTION_DEFAULTED Array & operator = ( const Array & ) = default ;
// Some supported compilers are not sufficiently C++11 compliant
// for default move constructor and move assignment operator.
@@ -209,7 +209,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & rhs ) = delete ;
@@ -278,7 +278,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
~Array() = default ;
KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & ) = delete ;

View File

@@ -80,6 +80,11 @@
// Compiling NVIDIA device code, must use Cuda atomics:
#define KOKKOS_ENABLE_CUDA_ATOMICS
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU)
#define KOKKOS_ENABLE_ROCM_ATOMICS
#endif
#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
@@ -154,6 +159,19 @@ const char * atomic_query_version()
} // namespace Kokkos
#if defined( KOKKOS_ENABLE_ROCM )
#include <ROCm/Kokkos_ROCm_Atomic.hpp>
namespace Kokkos {
namespace Impl {
extern KOKKOS_INLINE_FUNCTION
bool lock_address_rocm_space(void* ptr);
extern KOKKOS_INLINE_FUNCTION
void unlock_address_rocm_space(void* ptr);
}
}
#endif
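A hedged sketch of how these lock hooks are typically consumed by arbitrary-size atomics; the pointer 'ptr', type T, and update 'op' are placeholders, not declarations from this header:
// Lock-based fallback: spin until the address lock is held, update, release.
while ( ! Kokkos::Impl::lock_address_rocm_space( (void*) ptr ) ) ; // spin
*ptr = op( *ptr ); // protected read-modify-write of a T with no native atomic
Kokkos::Impl::unlock_address_rocm_space( (void*) ptr );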
#ifdef _WIN32
#include "impl/Kokkos_Atomic_Windows.hpp"
#else

View File

@@ -107,6 +107,11 @@ public:
re_ (val), im_ (0.0)
{}
// BUG HCC WORKAROUND
KOKKOS_INLINE_FUNCTION complex( const RealType& re, const RealType& im):
re_ (re), im_ (im)
{}
//! Constructor that takes the real and imaginary parts.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
@@ -227,6 +232,16 @@ public:
return re_;
}
//! Set the imaginary part of this complex number.
KOKKOS_INLINE_FUNCTION void imag (RealType v) {
im_ = v;
}
//! Set the real part of this complex number.
KOKKOS_INLINE_FUNCTION void real (RealType v) {
re_ = v;
}
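Usage sketch for the new setters; the variable z and its values are illustrative:
Kokkos::complex<double> z;
z.real( 3.0 );  // set the real part
z.imag( -4.0 ); // set the imaginary part
// The existing zero-argument overloads remain the getters: z.real() == 3.0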
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const complex<RealType>& src) {
re_ += src.re_;
@@ -299,7 +314,7 @@ public:
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (y.real ()) + ::fabs (y.imag ());
const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
@@ -537,7 +552,7 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (real (y)) + ::fabs (imag (y));
const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
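A one-step illustration of the scaling trick, a sketch consistent with the comments above rather than the full division routine: with s = |re(y)| + |im(y)|, y/s has 1-norm one, so evaluating (x/s) / (y/s) avoids the intermediate overflow that forming |y|^2 directly could produce.
const RealType s = std::fabs( real(y) ) + std::fabs( imag(y) );
const complex<RealType> x_s( real(x) / s , imag(x) / s ); // x/s
const complex<RealType> y_s( real(y) / s , imag(y) / s ); // y/s, 1-norm == 1
// x / y == x_s / y_s, now safe to evaluate with the usual conjugate formula.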

View File

@@ -74,6 +74,10 @@
#include <Kokkos_Cuda.hpp>
#endif
#if defined( KOKKOS_ENABLE_ROCM )
#include <Kokkos_ROCm.hpp>
#endif
#include <Kokkos_Pair.hpp>
#include <Kokkos_MemoryPool.hpp>
#include <Kokkos_Array.hpp>

View File

@@ -122,6 +122,13 @@ class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU
class Cuda; ///< Execution space for Cuda GPU
#endif
#if defined( KOKKOS_ENABLE_ROCM )
namespace Experimental {
class ROCmSpace ; ///< Memory space on ROCm GPU
class ROCm ; ///< Execution space for ROCm GPU
}
#endif
template<class ExecutionSpace, class MemorySpace>
struct Device;
@@ -140,6 +147,8 @@ namespace Kokkos {
typedef Cuda DefaultExecutionSpace;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
typedef Experimental::ROCm DefaultExecutionSpace ;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -185,6 +194,8 @@ namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace;
#else

View File

@@ -98,18 +98,18 @@ public:
typedef View<size_type* , array_layout, device_type> row_map_type;
typedef View<DataType* , array_layout, device_type> entries_type;
entries_type entries;
row_map_type row_map;
entries_type entries;
//! Construct an empty view.
Crs () : entries(), row_map() {}
Crs() : row_map(), entries() {}
//! Copy constructor (shallow copy).
Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
Crs(const Crs& rhs) : row_map(rhs.row_map), entries(rhs.entries)
{}
template<class EntriesType, class RowMapType>
Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_)
{}
/** \brief Assign to a view of the rhs array.
@@ -117,8 +117,8 @@ public:
* then allocated memory is deallocated.
*/
Crs& operator= (const Crs& rhs) {
entries = rhs.entries;
row_map = rhs.row_map;
entries = rhs.entries;
return *this;
}
@@ -151,7 +151,7 @@ void get_crs_transpose_counts(
template< class OutCounts,
class InCrs>
void get_crs_row_map_from_counts(
typename OutCounts::value_type get_crs_row_map_from_counts(
OutCounts& out,
InCrs const& in,
std::string const& name = "row_map");
@@ -204,18 +204,20 @@ class CrsRowMapFromCounts {
using execution_space = typename InCounts::execution_space;
using value_type = typename OutRowMap::value_type;
using index_type = typename InCounts::size_type;
using last_value_type = Kokkos::View<value_type, execution_space>;
private:
InCounts in;
OutRowMap out;
InCounts m_in;
OutRowMap m_out;
last_value_type m_last_value;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i, value_type& update, bool final_pass) const {
update += in(i);
if (final_pass) {
out(i + 1) = update;
if (i == 0) {
out(0) = 0;
}
if (i < m_in.size()) {
update += m_in(i);
if (final_pass) m_out(i + 1) = update;
} else if (final_pass) {
m_out(0) = 0;
m_last_value() = update;
}
}
KOKKOS_INLINE_FUNCTION
@@ -226,12 +228,16 @@ class CrsRowMapFromCounts {
}
using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
in(arg_in),out(arg_out) {
m_in(arg_in), m_out(arg_out), m_last_value("last_value") {
}
value_type execute() {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
closure_type closure(*this, policy_type(0, in.size()));
closure_type closure(*this, policy_type(0, m_in.size() + 1));
closure.execute();
execution_space::fence();
auto last_value = Kokkos::create_mirror_view(m_last_value);
Kokkos::deep_copy(last_value, m_last_value);
return last_value();
}
};
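The intended result of this scan can be stated with a small serial sketch (an assumed stand-alone helper using std::vector, not the parallel Kokkos path): the row map is the exclusive prefix sum of the counts, and the captured last value is the total entry count, which execute() now returns.
// counts {2, 1, 3} -> row_map {0, 2, 3, 6}, return value 6
std::size_t row_map_from_counts( std::vector<int> const& counts ,
                                 std::vector<int>& row_map ) {
  row_map.assign( counts.size() + 1 , 0 );
  for ( std::size_t i = 0 ; i < counts.size() ; ++i )
    row_map[i + 1] = row_map[i] + counts[i];
  return row_map.back();
}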
@@ -297,13 +303,14 @@ void get_crs_transpose_counts(
template< class OutRowMap,
class InCounts>
void get_crs_row_map_from_counts(
typename OutRowMap::value_type get_crs_row_map_from_counts(
OutRowMap& out,
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}
template< class DataType,
@@ -328,6 +335,65 @@ void transpose_crs(
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
template< class CrsType,
class Functor>
struct CountAndFill {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
data_type* fill = &(m_crs.entries(j));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
m_crs(crs),
m_functor(f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
Kokkos::Impl::ParallelFor<self_type, count_policy_type>;
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
Kokkos::Impl::ParallelFor<self_type, fill_policy_type>;
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
}
};
template< class CrsType,
class Functor>
void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
}
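A hedged usage sketch of count_and_fill_crs: the functor is invoked once per row with fill == nullptr and must return that row's entry count, then once more with a pointer into the allocated entries. The 'graph' object and the lambda body are assumptions for illustration.
// Row i stores the two entries {i, i+1}.
Kokkos::Experimental::count_and_fill_crs( graph , nrows ,
  KOKKOS_LAMBDA ( int i , int* fill ) -> int {
    if ( fill ) { fill[0] = i ; fill[1] = i + 1 ; }
    return 2 ;
  } );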
}} // namespace Kokkos::Experimental
#endif /* #define KOKKOS_CRS_HPP */

View File

@@ -96,6 +96,14 @@
//----------------------------------------------------------------------------
#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_THREADS) || \
defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_QTHREADS) || \
defined(KOKKOS_ENABLE_ROCM) || defined(KOKKOS_ENABLE_OPENMPTARGET)
#define KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Compiling with a CUDA compiler.
//
@@ -133,6 +141,9 @@
#if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
#define KOKKOS_LAMBDA [=]__device__
#if defined( KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND )
#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#endif
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
@@ -141,16 +152,13 @@
#endif
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
// Cuda version 8.0 still needs the functor wrapper
#if /* ( CUDA_VERSION < 8000 ) && */ defined( __NVCC__ )
#if defined( __NVCC__ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
#endif
#else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
#undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
//----------------------------------------------------------------------------
// Language info: C++, CUDA, OPENMP
@@ -161,8 +169,20 @@
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#ifdef KOKKOS_COMPILER_CLANG
#define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
#endif
#endif // #if defined( __CUDA_ARCH__ )
#if defined( KOKKOS_ENABLE_ROCM ) && defined( __HCC__ )
#define KOKKOS_FORCEINLINE_FUNCTION __attribute__((amp,cpu)) inline
#define KOKKOS_INLINE_FUNCTION __attribute__((amp,cpu)) inline
#define KOKKOS_FUNCTION __attribute__((amp,cpu))
#define KOKKOS_LAMBDA [=] __attribute__((amp,cpu))
#define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
#endif
#if defined( _OPENMP )
// Compiling with OpenMP.
// The value of _OPENMP is an integer value YYYYMM
@@ -179,15 +199,6 @@
// Host code is compiled again with another compiler.
// Device code is compile to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
#else
#if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
// CUDA (including version 6.5) does not support giving lambdas as
// arguments to global functions. Thus it's not currently possible
// to dispatch lambdas from the host.
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif
#endif // #if defined( __NVCC__ )
#if !defined( KOKKOS_LAMBDA )
@@ -321,6 +332,10 @@
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_ENABLE_ASM )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif
//----------------------------------------------------------------------------
@@ -397,6 +412,10 @@
#define KOKKOS_FUNCTION /**/
#endif
#if !defined( KOKKOS_FUNCTION_DEFAULTED )
#define KOKKOS_FUNCTION_DEFAULTED /**/
#endif
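A sketch of how the new KOKKOS_FUNCTION_DEFAULTED macro is applied; the struct is illustrative. It marks defaulted special members, expanding to the __host__ __device__ markup where the active compiler requires it (e.g. clang as the CUDA compiler, or HCC for ROCm) and to nothing otherwise:
struct MyFunctor {
  KOKKOS_FUNCTION_DEFAULTED MyFunctor() = default ;
  KOKKOS_FUNCTION_DEFAULTED ~MyFunctor() = default ;
};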
//----------------------------------------------------------------------------
// Define empty macro for restrict if necessary:
@@ -424,6 +443,7 @@
// There is zero or one default execution space specified.
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
@@ -435,6 +455,7 @@
// If default is not specified then choose from enabled execution spaces.
// Priority: CUDA, ROCM, OPENMPTARGET, OPENMP, THREADS, QTHREADS, SERIAL
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -442,6 +463,8 @@
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined( KOKKOS_ENABLE_ROCM )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM
#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
#elif defined( KOKKOS_ENABLE_OPENMP )
@@ -459,6 +482,8 @@
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#elif defined( __HCC__ ) && defined( __HCC_ACCELERATOR__ ) && defined( KOKKOS_ENABLE_ROCM )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif

View File

@@ -233,12 +233,24 @@ public:
//--------------------------------------------------------------------------
MemoryPool() = default ;
MemoryPool( MemoryPool && ) = default ;
MemoryPool( const MemoryPool & ) = default ;
MemoryPool & operator = ( MemoryPool && ) = default ;
MemoryPool & operator = ( const MemoryPool & ) = default ;
MemoryPool()
: m_tracker()
, m_sb_state_array(0)
, m_sb_state_size(0)
, m_sb_size_lg2(0)
, m_max_block_size_lg2(0)
, m_min_block_size_lg2(0)
, m_sb_count(0)
, m_hint_offset(0)
, m_data_offset(0)
, m_unused_padding(0)
{}
/**\brief Allocate a memory pool from 'memspace'.
*
* The memory pool will have at least 'min_total_alloc_size' bytes
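A hedged usage sketch of the pool interface; the space object, sizes, and the 256-byte request are illustrative values, not defaults from this header:
Kokkos::MemoryPool< Kokkos::DefaultExecutionSpace > pool( memspace , min_total_alloc_size );
void * p = pool.allocate( 256 ); // may return nullptr when the pool is exhausted
if ( p ) pool.deallocate( p , 256 );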

View File

@@ -1016,7 +1016,7 @@ parallel_reduce( std::string const & arg_label
//------------------------------
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce(arg_label, 0, &kpID);
@@ -1042,7 +1042,7 @@ parallel_reduce( std::string const & arg_label
//------------------------------
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}

View File

@@ -0,0 +1,220 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCM_HPP
#define KOKKOS_ROCM_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <ROCm/hc_math_std.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ROCmSpace.hpp>
#include <ROCm/Kokkos_ROCm_Exec.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
#include <hc.hpp>
#include <hc_am.hpp>
#include <amp_math.h>
#if defined( __HCC_ACCELERATOR__ )
using namespace ::Concurrency::precise_math ;
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class ROCmExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
/// \class ROCm
/// \brief Execution space for ROCm GPU.
class ROCm {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
typedef ROCm execution_space ;
typedef ROCmSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef LayoutLeft array_layout ;
typedef HostSpace::size_type size_type ;
typedef ScratchMemorySpace< ROCm > scratch_memory_space ;
~ROCm() {}
ROCm();
// explicit ROCm( const int instance_id );
ROCm( ROCm && ) = default ;
ROCm( const ROCm & ) = default ;
ROCm & operator = ( ROCm && ) = default ;
ROCm & operator = ( const ROCm & ) = default ;
//@}
//------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined( __HCC_ACCELERATOR__ )
return true;
#else
return false;
#endif
}
/** \brief Set the device in a "sleep" state. */
static bool sleep() ;
/** \brief Wake the device from the 'sleep' state. */
static bool wake() ;
/** \brief Wait until all dispatched functors complete. */
static void fence() ;
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
/// \brief Free any resources being consumed by the device.
static void finalize() ;
/** \brief Initialize the device.
*
*/
struct SelectDevice {
int rocm_device_id ;
SelectDevice() : rocm_device_id(1) {}
explicit SelectDevice( int id ) : rocm_device_id( id+1 ) {}
};
int rocm_device() const { return m_device ; }
bool isAPU();
bool isAPU(int device);
static void initialize( const SelectDevice = SelectDevice());
static int is_initialized();
// static size_type device_arch();
// static size_type detect_device_count();
static int concurrency() ;
static const char* name();
private:
int m_device ;
};
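A hedged sketch of driving this execution space directly; the device id 0 is illustrative, and most applications initialize through Kokkos::initialize rather than calling these statics themselves:
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
// ... dispatch parallel work on the ROCm space ...
Kokkos::Experimental::ROCm::fence(); // wait for dispatched functors
Kokkos::Experimental::ROCm::finalize();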
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<>
struct MemorySpaceAccess
< Kokkos::Experimental::ROCmSpace
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = false };
};
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Experimental::ROCm::memory_space
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::HostSpace
, Kokkos::Experimental::ROCm::scratch_memory_space
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
};
} // namespace Impl
} // namespace Kokkos
#include <ROCm/Kokkos_ROCm_Parallel.hpp>
#include <ROCm/Kokkos_ROCm_Task.hpp>
#endif
#endif

View File

@@ -0,0 +1,622 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCMSPACE_HPP
#define KOKKOS_ROCMSPACE_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <iosfwd>
#include <typeinfo>
#include <string>
#include <Kokkos_HostSpace.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
/** \brief ROCm on-device memory management */
class ROCmSpace {
public:
//! Tag this class as a kokkos memory space
typedef ROCmSpace memory_space ;
typedef Kokkos::Experimental::ROCm execution_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
/*--------------------------------*/
ROCmSpace();
ROCmSpace( ROCmSpace && rhs ) = default ;
ROCmSpace( const ROCmSpace & rhs ) = default ;
ROCmSpace & operator = ( ROCmSpace && rhs ) = default ;
ROCmSpace & operator = ( const ROCmSpace & rhs ) = default ;
~ROCmSpace() = default ;
/**\brief Allocate untracked memory in the rocm space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate untracked memory in the rocm space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name() { return m_name; };
/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access ROCmSpace */
static void access_error();
static void access_error( const void * const );
private:
int m_device ; ///< Which ROCm device
static constexpr const char* m_name = "ROCm";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
};
} // namespace Experimental
namespace Impl {
void * rocm_device_allocate(int);
void * rocm_hostpinned_allocate(int);
void rocm_device_free(void * );
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_arrays_rocm_space();
/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* atomic_lock_array_rocm_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
///
/// Team and Thread private scratch allocations in
/// global memory are acquired via locks.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* scratch_lock_array_rocm_space_ptr(bool deallocate = false);
/// \brief Retrieve the pointer to the scratch array for unique identifiers.
///
/// Unique identifiers in the range 0-ROCm::concurrency
/// are provided via locks.
/// This function retrieves the lock array pointer.
/// If the array is not yet allocated it will do so.
int* threadid_lock_array_rocm_space_ptr(bool deallocate = false);
}
} // namespace Kokkos
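The lock arrays documented above implement a hash-of-address scheme; a tiny sketch of the indexing idea, with the table size and shift chosen here purely for illustration (requires <cstdint>):
// Atomics on the same address always hash to the same lock slot.
constexpr std::uintptr_t LOCK_TABLE_MASK = 0x1FFFF; // assumed table size - 1
inline int lock_index( void const* ptr ) {
  return static_cast<int>( ( reinterpret_cast<std::uintptr_t>(ptr) >> 2 ) & LOCK_TABLE_MASK );
}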
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
/** \brief Host memory that is accessible to ROCm execution space
* through ROCm's host-pinned memory allocation.
*/
class ROCmHostPinnedSpace {
public:
//! Tag this class as a kokkos memory space
/** \brief Memory is in HostSpace so use the HostSpace::execution_space */
typedef HostSpace::execution_space execution_space ;
typedef ROCmHostPinnedSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
/*--------------------------------*/
ROCmHostPinnedSpace();
ROCmHostPinnedSpace( ROCmHostPinnedSpace && rhs ) = default ;
ROCmHostPinnedSpace( const ROCmHostPinnedSpace & rhs ) = default ;
ROCmHostPinnedSpace & operator = ( ROCmHostPinnedSpace && rhs ) = default ;
ROCmHostPinnedSpace & operator = ( const ROCmHostPinnedSpace & rhs ) = default ;
~ROCmHostPinnedSpace() = default ;
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name() { return m_name; };
private:
static constexpr const char* m_name = "ROCmHostPinned";
/*--------------------------------*/
};
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace >::assignable , "" );
//----------------------------------------
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace > {
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
// HostSpace::execution_space == ROCmHostPinnedSpace::execution_space
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};
//----------------------------------------
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace > {
enum { assignable = false };
enum { accessible = false };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
// ROCmSpace::execution_space != ROCmHostPinnedSpace::execution_space
enum { assignable = false };
enum { accessible = true }; // ROCmSpace::execution_space
enum { deepcopy = true };
};
//----------------------------------------
// ROCmHostPinnedSpace::execution_space == HostSpace::execution_space
// ROCmHostPinnedSpace accessible to both ROCm and Host
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::HostSpace > {
enum { assignable = false }; // Cannot access from ROCm
enum { accessible = true }; // ROCmHostPinnedSpace::execution_space
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace > {
enum { assignable = false }; // Cannot access from Host
enum { accessible = false };
enum { deepcopy = true };
};
} // namespace Impl
//----------------------------------------
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n);
template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm>
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace>
struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm>
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
};
template<class ExecutionSpace>
struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace>
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopyROCm (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
fut.wait();
// DeepCopyROCm (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
// hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
// fut.wait();
// DeepCopyAsyncROCm (dst,src,n);
DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
template<class ExecutionSpace>
struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
inline
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
{
exec.fence();
DeepCopy (dst,src,n);
}
};
} // namespace Impl
} // namespace Kokkos
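A hedged usage sketch of what these specializations serve; the view names and extent n are illustrative:
Kokkos::View< double* , Kokkos::Experimental::ROCmSpace > d( "d" , n );
auto h = Kokkos::create_mirror_view( d ); // HostSpace mirror of the device view
Kokkos::deep_copy( d , h ); // HostSpace -> ROCmSpace
Kokkos::deep_copy( h , d ); // ROCmSpace -> HostSpace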
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** Running in ROCmSpace attempting to access HostSpace: error */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
};
/** Running in ROCmSpace accessing ROCmHostPinnedSpace: ok */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
/** Running in ROCmSpace attempting to access an unknown space: error */
template< class OtherSpace >
struct VerifyExecutionCanAccessMemorySpace<
typename enable_if< ! is_same<Kokkos::Experimental::ROCmSpace,OtherSpace>::value , Kokkos::Experimental::ROCmSpace >::type ,
OtherSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
};
//----------------------------------------------------------------------------
/** Running in HostSpace attempting to access ROCmSpace */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >
{
enum { value = false };
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};
/** Running in HostSpace accessing ROCmHostPinnedSpace is OK */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) {}
KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
const Kokkos::Experimental::ROCmSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::Experimental::ROCmSpace & , bool detail = false );
};
template<>
class SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
const Kokkos::Experimental::ROCmHostPinnedSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_space() {}
SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::Experimental::ROCmHostPinnedSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */
#endif /* #define KOKKOS_ROCMSPACE_HPP */

View File

@@ -681,6 +681,67 @@ public:
return f ;
}
template < class F >
KOKKOS_FUNCTION
Future< execution_space >
when_all( int narg , F const func )
{
using input_type = decltype( func(0) );
using future_type = Future< execution_space > ;
using task_base = Kokkos::Impl::TaskBase< void , void , void > ;
static_assert( is_future< input_type >::value
, "Functor must return a Kokkos::Future" );
future_type f ;
if ( 0 == narg ) return f ;
size_t const alloc_size = m_queue->when_all_allocation_size( narg );
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( alloc_size ) );
if ( f.m_task ) {
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
new( f.m_task ) task_base();
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = alloc_size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
// Assign dependences, reference counts were already incremented
task_base * volatile * const dep =
f.m_task->aggregate_dependences();
for ( int i = 0 ; i < narg ; ++i ) {
const input_type arg_f = func(i);
if ( 0 != arg_f.m_task ) {
if ( m_queue != static_cast< queue_type * >( arg_f.m_task->m_queue ) ) {
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
}
// Increment reference count to track subsequent assignment.
Kokkos::atomic_increment( &(arg_f.m_task->m_ref_count) );
dep[i] = arg_f.m_task ;
}
}
Kokkos::memory_fence();
m_queue->schedule_aggregate( f.m_task );
// this when_all may be processed at any moment
}
return f ;
}
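A hedged usage sketch of this generator form of when_all; the scheduler 'sched', the 'deps' array, and 'narg' are assumptions. Each func(i) must return a future from the same scheduler, as the abort check above enforces:
auto all_done = sched.when_all( narg ,
  [=] ( int i ) { return deps[i] ; } );
// 'all_done' becomes ready only after every deps[i] has completed.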
//----------------------------------------
KOKKOS_INLINE_FUNCTION

View File

@@ -2429,6 +2429,7 @@ template < class ValueType >
struct CommonViewAllocProp< void, ValueType >
{
using value_type = ValueType;
using scalar_array_type = ValueType;
template < class ... Views >
CommonViewAllocProp( const Views & ... ) {}

View File

@@ -0,0 +1,439 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <hc.hpp>
//#include <hsa_atomic.h>
#ifdef KOKKOS_ENABLE_ROCM_ATOMICS
namespace Kokkos {
//ROCm can do:
//Types int/unsigned int
//variants: atomic_exchange/compare_exchange/fetch_add/fetch_sub/fetch_max/fetch_min/fetch_and/fetch_or/fetch_xor/fetch_inc/fetch_dec
KOKKOS_INLINE_FUNCTION
int atomic_exchange(int* dest, const int& val) {
return hc::atomic_exchange_int(dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_exchange(unsigned int* dest, const unsigned int& val) {
return hc::atomic_exchange_unsigned(dest, val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_exchange(int64_t* dest, const int64_t& val) {
return (int64_t)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
uint64_t atomic_exchange(uint64_t* dest, const uint64_t& val) {
return hc::atomic_exchange_uint64(dest, val);
}
KOKKOS_INLINE_FUNCTION
long long atomic_exchange(long long* dest, const long long& val) {
return (long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
unsigned long long atomic_exchange(unsigned long long* dest, const unsigned long long& val) {
return (unsigned long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
float atomic_exchange(float* dest, const float& val) {
union U {
int i ;
float f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,ival;
idest.f = *dest;
ival.f = val;
idest.i = hc::atomic_exchange_int((int*)dest, ival.i);
return idest.f;
}
KOKKOS_INLINE_FUNCTION
double atomic_exchange(double* dest, const double& val) {
union U {
uint64_t i ;
double d ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,ival;
idest.d = *dest;
ival.d = val;
idest.i = hc::atomic_exchange_uint64((uint64_t*)dest, ival.i);
return idest.d;
}
KOKKOS_INLINE_FUNCTION
int atomic_compare_exchange(int* dest, int compare, const int& val);
KOKKOS_INLINE_FUNCTION
int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val);
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
uint64_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( (int64_t*)(dest) , assume.i, newval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
return val;
}
KOKKOS_INLINE_FUNCTION
int atomic_compare_exchange(int* dest, int compare, const int& val) {
return hc::atomic_compare_exchange_int(dest, compare, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_compare_exchange(unsigned int* dest, unsigned int compare, const unsigned int& val) {
return hc::atomic_compare_exchange_unsigned(dest, compare, val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val) {
return (int64_t) hc::atomic_compare_exchange_uint64((uint64_t*)dest, (uint64_t)compare, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
uint64_t atomic_compare_exchange(uint64_t* dest, uint64_t compare, const uint64_t& val) {
return hc::atomic_compare_exchange_uint64(dest, compare, val);
}
KOKKOS_INLINE_FUNCTION
long long atomic_compare_exchange(long long* dest, long long compare, const long long& val) {
return (long long)hc::atomic_compare_exchange_uint64((uint64_t*)(dest), (uint64_t)(compare), (const uint64_t&)(val));
}
KOKKOS_INLINE_FUNCTION
float atomic_compare_exchange(float* dest, float compare, const float& val) {
union U {
int i ;
float f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.f = *dest;
icompare.f = compare;
ival.f = val;
idest.i = hc::atomic_compare_exchange_int(reinterpret_cast<int*>(dest), icompare.i, ival.i);
return idest.f;
}
KOKKOS_INLINE_FUNCTION
double atomic_compare_exchange(double* dest, double compare, const double& val) {
union U {
uint64_t i ;
double d ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.d = *dest;
icompare.d = compare;
ival.d = val;
idest.i = hc::atomic_compare_exchange_uint64(reinterpret_cast<uint64_t*>(dest), icompare.i, ival.i);
return idest.d;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
int i ;
T f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.f = *dest;
icompare.f = compare;
ival.f = val;
idest.i = hc::atomic_compare_exchange_int((int*)(dest), icompare.i, ival.i);
return idest.f;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
uint64_t i ;
T f ;
KOKKOS_INLINE_FUNCTION U() {};
} idest,icompare,ival;
idest.f = *dest;
icompare.f = compare;
ival.f = val;
idest.i = hc::atomic_compare_exchange_uint64((uint64_t*)(dest), icompare.i, ival.i);
return idest.f;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<(sizeof(T) != sizeof(int32_t)) && (sizeof(T) != sizeof(int64_t)), const T&>::type val) {
return val;
}
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add (volatile int * dest, const int& val) {
return hc::atomic_fetch_add((int *)dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_add(unsigned int* dest, const unsigned int& val) {
return hc::atomic_fetch_add(dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned long atomic_fetch_add(volatile unsigned long* dest, const unsigned long& val) {
return (unsigned long)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t)val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_fetch_add(volatile int64_t* dest, const int64_t& val) {
return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
char atomic_fetch_add(volatile char * dest, const char& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = ( assume & 0xffffff00 ) | ( ( ( assume & 0xff ) + val ) & 0xff ) ; // keep upper bytes, wrap low byte
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
short atomic_fetch_add(volatile short * dest, const short& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = ( assume & 0xffff0000 ) | ( ( ( assume & 0xffff ) + val ) & 0xffff ) ; // keep upper halfword, wrap low halfword
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
long long atomic_fetch_add(volatile long long * dest, const long long& val) {
return (long long)hc::atomic_fetch_add((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
int atomic_fetch_sub (volatile int * dest, const int& val) {
return hc::atomic_fetch_sub((int *)dest, val);
}
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_sub(volatile unsigned int* dest, const unsigned int& val) {
return hc::atomic_fetch_sub((unsigned int *)dest, val);
}
KOKKOS_INLINE_FUNCTION
int64_t atomic_fetch_sub(int64_t* dest, const int64_t& val) {
return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, -(const uint64_t&)val);
// return (int64_t)hc::atomic_fetch_sub_uint64((uint64_t*)dest, (const uint64_t&)val);
}
KOKKOS_INLINE_FUNCTION
char atomic_fetch_sub(volatile char * dest, const char& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = ( assume & 0xffffff00 ) | ( ( ( assume & 0xff ) - val ) & 0xff ) ; // keep upper bytes, wrap low byte
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
short atomic_fetch_sub(volatile short * dest, const short& val) {
unsigned int oldval,newval,assume;
oldval = *(int *)dest ;
do {
assume = oldval ;
newval = (assume & 0xffff0000) | (((assume & 0xffff) - val) & 0xffff) ;
oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
} while ( assume != oldval );
return oldval ;
}
KOKKOS_INLINE_FUNCTION
long long atomic_fetch_sub(volatile long long * dest, const long long& val) {
return (long long)hc::atomic_fetch_add((uint64_t*)dest, (uint64_t)(-val));
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
unsigned int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomic_compare_exchange( (unsigned int*)(dest) , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
uint64_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomic_compare_exchange( (uint64_t*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//WORKAROUND
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) != sizeof(int64_t), const T&>::type val) {
return val ;
}
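// Note: like the CAS fallback above, this workaround silently returns 'val'
// for sizes with no 32/64-bit path; *dest is not updated.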
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template<class T>
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub(volatile T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
union U {
int64_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomic_compare_exchange( (int64_t*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
}
#endif

View File

@ -0,0 +1,51 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
#define GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
#ifndef KOKKOS_ROCM_HAS_WORKAROUNDS
#define KOKKOS_ROCM_HAS_WORKAROUNDS 1
#endif
#endif

View File

@ -0,0 +1,133 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCMEXEC_HPP
#define KOKKOS_ROCMEXEC_HPP
#include <algorithm>
#include <typeinfo>
#include <Kokkos_Macros.hpp>
//#include <ROCm/Kokkos_ROCmExec.hpp>
#include <hc.hpp>
#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF
#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39
#define ROCM_CONCURRENCY 20480
//#define ROCM_CONCURRENCY 81920 // for Fiji
namespace Kokkos {
static int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
static int rocm_space_scratch_locks[ROCM_CONCURRENCY];
static int rocm_space_threadid_locks[ROCM_CONCURRENCY];
namespace Impl {
// TODO: mimic the CUDA implementation, add dGPU capability
void init_rocm_atomic_lock_array() {
static int is_initialized = 0;
if(!is_initialized)
{
for(int i = 0; i < ROCM_SPACE_ATOMIC_MASK+1; i++)
rocm_space_atomic_locks[i] = 0;
is_initialized = 1;
}
}
void init_rocm_scratch_lock_array() {
static int is_initialized = 0;
if(!is_initialized)
{
for(int i = 0; i < ROCM_CONCURRENCY; i++)
rocm_space_scratch_locks[i] = 0;
is_initialized = 1;
}
}
void init_rocm_threadid_lock_array() {
static int is_initialized = 0;
if(!is_initialized)
{
for(int i = 0; i < ROCM_CONCURRENCY; i++)
rocm_space_threadid_locks[i] = 0;
is_initialized = 1;
}
}
void init_lock_arrays_rocm_space() {
init_rocm_atomic_lock_array();
// init_rocm_scratch_lock_array();
// init_rocm_threadid_lock_array();
}
}
} // namespace Kokkos
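// Illustration only: these lock arrays back atomics on types too large for a
// native path. A caller hashes the destination address to a slot, spins until
// it acquires the slot, applies the update, then releases it. A hypothetical
// sketch of that pattern (not part of this commit):
#if 0
template<class T, class Op>
T rocm_locked_atomic_op(volatile T* dest, const T& val, Op op) {
  while( !Kokkos::Impl::lock_address_rocm_space((void*)dest) ) {}
  T old = *dest;
  *dest = op(old, val);   // arbitrary read-modify-write under the lock
  Kokkos::Impl::unlock_address_rocm_space((void*)dest);
  return old;
}
#endif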
#if 0
namespace Kokkos {
namespace Impl {
KOKKOS_INLINE_FUNCTION
bool lock_address_rocm_space(void* ptr) {
#if 0
return(Kokkos::Impl::lock_address_host_space(ptr));
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
#endif
}
KOKKOS_INLINE_FUNCTION
void unlock_address_rocm_space(void* ptr) {
#if 0
Kokkos::Impl::unlock_address_host_space(ptr) ;
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
#endif
}
}
} // namespace Kokkos
#endif
#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */

View File

@ -0,0 +1,137 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCMEXEC_HPP
#define KOKKOS_ROCMEXEC_HPP
#include <algorithm>
#include <typeinfo>
#if defined(__HCC_ACCELERATOR__)
#define printf(...)
#endif
namespace Kokkos {
namespace Impl {
struct ROCmTraits {
// TODO: determine if needed
enum { WavefrontSize = 64 /* 64 */ };
enum { WorkgroupSize = 64 /* 64 */ };
enum { WavefrontIndexMask = 0x003f /* Mask for lane index within a 64-wide wavefront */ };
enum { WavefrontIndexShift = 6 /* WavefrontSize == 1 << WavefrontIndexShift */ };
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
enum { UpperBoundExtentCount = 65535 /* Hard upper bound */ };
#if 0
KOKKOS_INLINE_FUNCTION static
ROCmSpace::size_type wavefront_count( ROCmSpace::size_type i )
{ return ( i + WavefrontIndexMask ) >> WavefrontIndexShift ; }
KOKKOS_INLINE_FUNCTION static
ROCmSpace::size_type wavefront_align( ROCmSpace::size_type i )
{
enum { Mask = ~ROCmSpace::size_type( WavefrontIndexMask ) };
return ( i + WavefrontIndexMask ) & Mask ;
}
#endif
};
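// Illustration only: typical use of the mask/shift pair, rounding a thread
// count up to whole wavefronts (mirrors the #if 0 helpers inside the struct):
#if 0
KOKKOS_INLINE_FUNCTION static
size_t demo_wavefront_count( size_t i )
{ return ( i + ROCmTraits::WavefrontIndexMask ) >> ROCmTraits::WavefrontIndexShift ; }
#endif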
size_t rocm_internal_cu_count();
size_t rocm_internal_maximum_workgroup_count();
size_t * rocm_internal_scratch_flags( const size_t size );
size_t * rocm_internal_scratch_space( const size_t size );
}
} // namespace Kokkos
#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF
#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39
//int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
extern int
*rocm_space_atomic_locks;
namespace Kokkos {
namespace Impl {
void init_lock_arrays_rocm_space();
void* rocm_resize_scratch_space(size_t bytes, bool force_shrink = false);
// TODO: determine if needed
KOKKOS_INLINE_FUNCTION
bool lock_address_rocm_space(void* ptr) {
#if 0
return(Kokkos::Impl::lock_address_host_space(ptr));
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
#endif
}
KOKKOS_INLINE_FUNCTION
void unlock_address_rocm_space(void* ptr) {
#if 0
Kokkos::Impl::unlock_address_host_space(ptr) ;
#else
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & ROCM_SPACE_ATOMIC_MASK;
hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
#endif
}
}
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
//extern
//KOKKOS_INLINE_FUNCTION
//void init_lock_arrays_rocm_space();
}
} // namespace Kokkos
#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */

View File

@ -0,0 +1,753 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/*--------------------------------------------------------------------------*/
/* Kokkos interfaces */
#include <Kokkos_Core.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_ROCM
//#include <ROCm/Kokkos_ROCm_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Kokkos_ROCmSpace.hpp>
#include <ROCm/Kokkos_ROCm_Exec.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
#include <stdlib.h>
/* Standard 'C++' libraries */
#include <vector>
#include <iostream>
#include <sstream>
#include <string>
//KOKKOS_INLINE_FUNCTION
// Kokkos::Impl::ROCmLockArraysStruct kokkos_impl_rocm_lock_arrays ;
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
#if 0
namespace {
__global__
void query_rocm_kernel_arch( int * d_arch )
{
#if defined( __HCC_ACCELERATOR__ )
*d_arch = __ROCM_ARCH__ ;
#else
*d_arch = 0 ;
#endif
}
/** Query what compute capability is actually launched to the device: */
int rocm_kernel_arch()
{
int * d_arch = 0 ;
rocmMalloc( (void **) & d_arch , sizeof(int) );
query_rocm_kernel_arch<<<1,1>>>( d_arch );
int arch = 0 ;
rocmMemcpy( & arch , d_arch , sizeof(int) , rocmMemcpyDefault );
rocmFree( d_arch );
return arch ;
}
bool rocm_launch_blocking()
{
const char * env = getenv("ROCM_LAUNCH_BLOCKING");
if (env == 0) return false;
return atoi(env);
}
}
#endif
// true device memory allocation, not visible from host
void * rocm_device_allocate(int size)
{
void * ptr;
hc::accelerator acc;
ptr = hc::am_alloc(size,acc,0);
return ptr;
}
// host pinned allocation
// flag = 1, non-coherent, host resident, but with gpu address space pointer
// flag = 2, coherent, host resident, but with host address space pointer
void * rocm_hostpinned_allocate(int size)
{
void * ptr;
hc::accelerator acc;
ptr = hc::am_alloc(size,acc,2);
return ptr;
}
// same free used by all rocm memory allocations
void rocm_device_free(void * ptr)
{
hc::am_free(ptr);
}
KOKKOS_INLINE_FUNCTION
void rocm_device_synchronize()
{
hc::accelerator_view av = hc::accelerator().get_default_view();
hc::completion_future fut = av.create_marker();
fut.wait();
}
void rocm_internal_error_throw( const char * name, const char * file, const int line )
{
#if 0
std::ostringstream out ;
out << name << " error( " << rocmGetErrorName(e) << "): " << rocmGetErrorString(e);
if (file) {
out << " " << file << ":" << line;
}
throw_runtime_exception( out.str() );
#endif
}
//----------------------------------------------------------------------------
// Some significant rocm device properties:
//
// rocmDeviceProp::name : Text label for device
// rocmDeviceProp::major : Device major number
// rocmDeviceProp::minor : Device minor number
// rocmDeviceProp::workgroupSize : number of threads per workgroup
// rocmDeviceProp::multiProcessorCount : number of multiprocessors
// rocmDeviceProp::sharedMemPerWavefront : capacity of shared memory per wavefront
// rocmDeviceProp::totalConstMem : capacity of constant memory
// rocmDeviceProp::totalGlobalMem : capacity of global memory
// rocmDeviceProp::maxGridSize[3] : maximum grid size
//
//
// the data we have available from a ROCm accelerator
// std::wstring get_device_path()
// std::wstring get_description()
// unsigned int get_version()
// bool get_has_display()
// size_t get_dedicated_memory()
// bool get_supports_double_precision()
// bool get_supports_limited_double_precision()
// bool get_is_debug()
// bool get_supports_cpu_shared_memory()
// size_t get_max_tile_static_size()
// unsigned int get_cu_count()
// bool has_cpu_accessible_am()
struct rocmDeviceProp {
char name[256];
char description[256];
unsigned int version;
int device_type;
int device_ordinal;
int major;
int minor;
size_t totalGlobalMem;
size_t sharedMemPerWavefront;
int WavefrontSize;
int WorkgroupSize;
int MaxTileCount;
int maxThreadsPerWorkgroup;
int multiProcessorCount;
int canMapHostMemory;
bool APU;
};
void rocmGetDeviceProperties(struct rocmDeviceProp* devProp, int device)
{
std::wstring s;
int i,n;
hc::accelerator acc;
std::vector<hc::accelerator> accv = acc.get_all() ;
hc::accelerator a = accv[device];
s=a.get_device_path();
i = 0;
for(wchar_t c: s)
if((n=std::wctomb(&devProp->name[i],c))>0)
i+=n;
devProp->name[i] = '\0'; // wctomb does not NUL-terminate the buffer
/* assume a CPU */
devProp->version = a.get_version();
devProp->major = a.get_version()>>16; // for CPU, these are meaningless
devProp->minor = a.get_version()&0xff;
devProp->device_ordinal = 0;
/* is this an AMD graphics card */
if((devProp->name[0]=='g') && (devProp->name[1]=='f')
&& (devProp->name[2]=='x')) {
/* for AMD cards, the name has the format gfxMmmO */
devProp->device_type = ((devProp->name[3]-0x30)<<16)
+ ((devProp->name[4]-0x30)<<8)
+ (devProp->name[5]-0x30);
devProp->device_ordinal = devProp->name[6]-0x30;
devProp->major = devProp->name[3]-0x30;
devProp->minor = devProp->name[5]-0x30;
}
s=a.get_description();
i = 0;
for(wchar_t c: s)
if((n=std::wctomb(&devProp->description[i],c))>0)
i+=n;
devProp->description[i] = '\0';
devProp->totalGlobalMem = a.get_dedicated_memory();
devProp->sharedMemPerWavefront = a.get_max_tile_static_size();
devProp->WavefrontSize = 64;
devProp->WorkgroupSize = 256; // preferred
devProp->MaxTileCount = 409600; // as defined in /opt/rocm/hcc-lc/include/hsa_new.h
devProp->maxThreadsPerWorkgroup = 1024;
devProp->multiProcessorCount = a.get_cu_count();
devProp->canMapHostMemory = a.get_supports_cpu_shared_memory();
// Kaveri has 64KB L2 per CU, 16KB L1, 64KB Vector Regs/SIMD, or 128 regs/thread
// GCN has 64KB LDS per CU
//Kaveri APU is 7:0:0
//Carrizo APU is 8:0:1
devProp->APU = ((devProp->major==7)&&(devProp->minor==0)) ||
((devProp->major==8)&&(devProp->minor==1));
}
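// Illustration only: enumerating the visible accelerators with the query
// above (demo_list_devices is hypothetical, not part of this commit):
#if 0
void demo_list_devices()
{
  hc::accelerator acc;
  const int count = (int) acc.get_all().size();
  for ( int i = 0 ; i < count ; ++i ) {
    rocmDeviceProp prop ;
    rocmGetDeviceProperties( & prop , i );
    std::cout << i << " : " << prop.name
              << " CUs: " << prop.multiProcessorCount << std::endl ;
  }
}
#endif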
namespace {
class ROCmInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 64 };
struct rocmDeviceProp m_rocmProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_rocmDevCount ;
ROCmInternalDevices();
static const ROCmInternalDevices & singleton();
};
ROCmInternalDevices::ROCmInternalDevices()
{
hc::accelerator acc;
std::vector<hc::accelerator> accv = acc.get_all() ;
m_rocmDevCount = accv.size();
if(m_rocmDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
}
for ( int i = 0 ; i < m_rocmDevCount ; ++i ) {
rocmGetDeviceProperties( m_rocmProp + i , i );
}
}
const ROCmInternalDevices & ROCmInternalDevices::singleton()
{
static ROCmInternalDevices* self = nullptr;
if (!self) {
self = new ROCmInternalDevices();
}
return *self;
}
}
//----------------------------------------------------------------------------
class ROCmInternal {
private:
ROCmInternal( const ROCmInternal & );
ROCmInternal & operator = ( const ROCmInternal & );
public:
typedef Kokkos::Experimental::ROCm::size_type size_type ;
int m_rocmDev ;
int m_rocmArch ;
unsigned m_multiProcCount ;
unsigned m_maxWorkgroup ;
unsigned m_maxSharedWords ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
static int was_finalized;
static ROCmInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int rocm_device_id );
void finalize();
void print_configuration( std::ostream & ) const ;
~ROCmInternal();
ROCmInternal()
: m_rocmDev( -1 )
, m_rocmArch( -1 )
, m_multiProcCount( 0 )
, m_maxWorkgroup( 0 )
, m_maxSharedWords( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
{}
size_type * scratch_space( const size_type size );
size_type * scratch_flags( const size_type size );
};
int ROCmInternal::was_finalized = 0;
//----------------------------------------------------------------------------
void ROCmInternal::print_configuration( std::ostream & s ) const
{
const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
#if defined( KOKKOS_ENABLE_ROCM )
s << "macro KOKKOS_ENABLE_ROCM : defined" << std::endl ;
#endif
#if defined( __hcc_version__ )
s << "macro __hcc_version__ = " << __hcc_version__
<< std::endl ;
#endif
for ( int i = 0 ; i < dev_info.m_rocmDevCount ; ++i ) {
s << "Kokkos::Experimental::ROCm[ " << i << " ] "
<< dev_info.m_rocmProp[i].name
<< " version " << (dev_info.m_rocmProp[i].major) << "." << dev_info.m_rocmProp[i].minor
<< ", Total Global Memory: " << human_memory_size(dev_info.m_rocmProp[i].totalGlobalMem)
<< ", Shared Memory per Wavefront: " << human_memory_size(dev_info.m_rocmProp[i].sharedMemPerWavefront);
if ( m_rocmDev == i ) s << " : Selected" ;
s << std::endl ;
}
}
//----------------------------------------------------------------------------
ROCmInternal::~ROCmInternal()
{
if ( m_scratchSpace ||
m_scratchFlags ) {
std::cerr << "Kokkos::Experimental::ROCm ERROR: Failed to call Kokkos::Experimental::ROCm::finalize()"
<< std::endl ;
std::cerr.flush();
}
m_rocmDev = -1 ;
m_rocmArch = -1 ;
m_multiProcCount = 0 ;
m_maxWorkgroup = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
}
int ROCmInternal::verify_is_initialized( const char * const label ) const
{
if ( m_rocmDev < 0 ) {
std::cerr << "Kokkos::Experimental::ROCm::" << label << " : ERROR device not initialized" << std::endl ;
}
return 0 <= m_rocmDev ;
}
ROCmInternal & ROCmInternal::singleton()
{
static ROCmInternal* self = nullptr ;
if (!self) {
self = new ROCmInternal();
}
return *self ;
}
void ROCmInternal::initialize( int rocm_device_id )
{
if ( was_finalized ) Kokkos::abort("Calling ROCm::initialize after ROCm::finalize is illegal\n");
if ( is_initialized() ) return;
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
const std::string msg("ROCm::initialize ERROR : HostSpace::execution_space is not initialized");
throw_runtime_exception( msg );
}
const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
const bool ok_id = 1 <= rocm_device_id &&
rocm_device_id < dev_info.m_rocmDevCount ;
// Need at least a GPU device
const bool ok_dev = ok_id &&
( 1 <= dev_info.m_rocmProp[ rocm_device_id ].major &&
0 <= dev_info.m_rocmProp[ rocm_device_id ].minor );
if ( ok_init && ok_dev ) {
const struct rocmDeviceProp & rocmProp =
dev_info.m_rocmProp[ rocm_device_id ];
m_rocmDev = rocm_device_id ;
// rocmSetDevice( m_rocmDev ) );
Kokkos::Impl::rocm_device_synchronize();
/*
// Query what compute capability architecture a kernel executes:
m_rocmArch = rocm_kernel_arch();
if ( m_rocmArch != rocmProp.major * 100 + rocmProp.minor * 10 ) {
std::cerr << "Kokkos::Experimental::ROCm::initialize WARNING: running kernels compiled for compute capability "
<< ( m_rocmArch / 100 ) << "." << ( ( m_rocmArch % 100 ) / 10 )
<< " on device with compute capability "
<< rocmProp.major << "." << rocmProp.minor
<< " , this will likely reduce potential performance."
<< std::endl ;
}
*/
// number of multiprocessors
m_multiProcCount = rocmProp.multiProcessorCount ;
//----------------------------------
// Maximum number of wavefronts,
// at most one workgroup per thread in a workgroup for reduction.
m_maxSharedWords = rocmProp.sharedMemPerWavefront/ WordSize ;
//----------------------------------
// Maximum number of Workgroups:
m_maxWorkgroup = 5*rocmProp.multiProcessorCount; //TODO: confirm usage and value
//----------------------------------
// Multiblock reduction uses scratch flags for counters
// and scratch space for partial reduction values.
// Allocate some initial space. This will grow as needed.
{
const unsigned reduce_block_count = m_maxWorkgroup * Impl::ROCmTraits::WorkgroupSize ;
(void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) );
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
//----------------------------------
}
else {
std::ostringstream msg ;
msg << "Kokkos::Experimental::ROCm::initialize(" << rocm_device_id << ") FAILED" ;
if ( ! ok_init ) {
msg << " : Already initialized" ;
}
if ( ! ok_id ) {
msg << " : Device identifier out of range "
<< "[0.." << (dev_info.m_rocmDevCount-1) << "]" ;
}
else if ( ! ok_dev ) {
msg << " : Device " ;
msg << dev_info.m_rocmProp[ rocm_device_id ].major ;
msg << "." ;
msg << dev_info.m_rocmProp[ rocm_device_id ].minor ;
msg << " Need at least a GPU" ;
msg << std::endl;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
// Init the array for used for arbitrarily sized atomics
Kokkos::Impl::init_lock_arrays_rocm_space();
// Kokkos::Impl::ROCmLockArraysStruct locks;
// locks.atomic = atomic_lock_array_rocm_space_ptr(false);
// locks.scratch = scratch_lock_array_rocm_space_ptr(false);
// locks.threadid = threadid_lock_array_rocm_space_ptr(false);
// rocmMemcpyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
}
//----------------------------------------------------------------------------
typedef Kokkos::Experimental::ROCm::size_type ScratchGrain[ Impl::ROCmTraits::WorkgroupSize ] ;
enum { sizeScratchGrain = sizeof(ScratchGrain) };
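// One ScratchGrain is one size_type per workgroup lane: with WorkgroupSize = 64
// and, for example, an 8-byte size_type, a grain is 512 bytes, so a request of
// 1000 bytes rounds up to (1000 + 511) / 512 = 2 grains in scratch_flags /
// scratch_space below.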
void rocmMemset( Kokkos::Experimental::ROCm::size_type * ptr , Kokkos::Experimental::ROCm::size_type value , Kokkos::Experimental::ROCm::size_type size)
{
char * mptr = (char * ) ptr;
#if 0
parallel_for_each(hc::extent<1>(size),
[=, &ptr]
(hc::index<1> idx) __HC__
{
int i = idx[0];
ptr[i] = value;
}).wait();
#else
for (int i= 0; i<size ; i++)
{
mptr[i] = (char) value;
}
#endif
}
Kokkos::Experimental::ROCm::size_type *
ROCmInternal::scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
{
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::HostSpace()
, "InternalScratchFlags"
, ( sizeScratchGrain * m_scratchFlagsCount ) );
Record::increment( r );
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
rocmMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain );
}
return m_scratchFlags ;
}
Kokkos::Experimental::ROCm::size_type *
ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
{
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::HostSpace()
, "InternalScratchSpace"
, ( sizeScratchGrain * m_scratchSpaceCount ) );
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
}
return m_scratchSpace ;
}
//----------------------------------------------------------------------------
void ROCmInternal::finalize()
{
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
// atomic_lock_array_rocm_space_ptr(false);
// scratch_lock_array_rocm_space_ptr(false);
// threadid_lock_array_rocm_space_ptr(false);
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< HostSpace > RecordROCm ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace > RecordHost ;
RecordROCm::decrement( RecordROCm::get_record( m_scratchFlags ) );
RecordROCm::decrement( RecordROCm::get_record( m_scratchSpace ) );
m_rocmDev = -1 ;
m_multiProcCount = 0 ;
m_maxWorkgroup = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
}
}
//----------------------------------------------------------------------------
Kokkos::Experimental::ROCm::size_type rocm_internal_cu_count()
{ return ROCmInternal::singleton().m_multiProcCount ; }
Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_extent_size()
{ return ROCmInternal::singleton().m_maxWorkgroup ; }
Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_shared_words()
{ return ROCmInternal::singleton().m_maxSharedWords ; }
Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_space( const Kokkos::Experimental::ROCm::size_type size )
{ return ROCmInternal::singleton().scratch_space( size ); }
Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
{ return ROCmInternal::singleton().scratch_flags( size ); }
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
//ROCm::size_type ROCm::detect_device_count()
//{ return Impl::ROCmInternalDevices::singleton().m_rocmDevCount ; }
int ROCm::concurrency() {
#if defined(KOKKOS_ARCH_KAVERI)
return 8*64*40; // 20480 kaveri
#else
return 32*64*40; // 81920 fiji and hawaii
#endif
}
int ROCm::is_initialized()
{ return Kokkos::Impl::ROCmInternal::singleton().is_initialized(); }
void ROCm::initialize( const ROCm::SelectDevice config )
{
Kokkos::Impl::ROCmInternal::singleton().initialize( config.rocm_device_id );
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
#if 0
std::vector<unsigned>
ROCm::detect_device_arch()
{
const Impl::ROCmInternalDevices & s = Impl::ROCmInternalDevices::singleton();
std::vector<unsigned> output( s.m_rocmDevCount );
for ( int i = 0 ; i < s.m_rocmDevCount ; ++i ) {
output[i] = s.m_rocmProp[i].major * 100 + s.m_rocmProp[i].minor ;
}
return output ;
}
ROCm::size_type ROCm::device_arch()
{
return 1 ;
}
#endif
void ROCm::finalize()
{
Kokkos::Impl::ROCmInternal::singleton().finalize();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
ROCm::ROCm()
: m_device( Kokkos::Impl::ROCmInternal::singleton().m_rocmDev )
{
Kokkos::Impl::ROCmInternal::singleton().verify_is_initialized( "ROCm instance constructor" );
}
bool ROCm::isAPU(int device) {
const Kokkos::Impl::ROCmInternalDevices & dev_info =
Kokkos::Impl::ROCmInternalDevices::singleton();
return (dev_info.m_rocmProp[device].APU);
}
bool ROCm::isAPU() {
return ROCm::isAPU(rocm_device());
}
//ROCm::ROCm( const int instance_id )
// : m_device( Impl::ROCmInternal::singleton().m_rocmDev )
//{}
void ROCm::print_configuration( std::ostream & s , const bool )
{ Kokkos::Impl::ROCmInternal::singleton().print_configuration( s ); }
bool ROCm::sleep() { return false ; }
bool ROCm::wake() { return true ; }
void ROCm::fence()
{
Kokkos::Impl::rocm_device_synchronize();
}
const char* ROCm::name() { return "ROCm"; }
} // namespace Experimental
} // namespace Kokkos
#endif // KOKKOS_ENABLE_ROCM
//----------------------------------------------------------------------------

View File

@ -0,0 +1,138 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <type_traits>
#include <Kokkos_Macros.hpp>
#if !defined( KOKKOS_ROCM_INVOKE_H )
#define KOKKOS_ROCM_INVOKE_H
namespace Kokkos {
namespace Impl {
template<class Tag, class F, class... Ts, typename std::enable_if<(!std::is_void<Tag>()), int>::type = 0>
KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
{
f(Tag(), static_cast<Ts&&>(xs)...);
}
template<class Tag, class F, class... Ts, typename std::enable_if<(std::is_void<Tag>()), int>::type = 0>
KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
{
f(static_cast<Ts&&>(xs)...);
}
template<class F, class Tag=void>
struct rocm_invoke_fn
{
F* f;
rocm_invoke_fn(F& f_) : f(&f_)
{}
template<class... Ts>
KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
{
rocm_invoke<Tag>(*f, static_cast<Ts&&>(xs)...);
}
};
template<class Tag, class F>
KOKKOS_INLINE_FUNCTION rocm_invoke_fn<F, Tag> make_rocm_invoke_fn(F& f)
{
return {f};
}
template<class T>
KOKKOS_INLINE_FUNCTION T& rocm_unwrap(T& x)
{
return x;
}
template<class T>
KOKKOS_INLINE_FUNCTION T& rocm_unwrap(std::reference_wrapper<T> x)
{
return x;
}
template<class F, class T>
struct rocm_capture_fn
{
F f;
T data;
KOKKOS_INLINE_FUNCTION rocm_capture_fn(F f_, T x)
: f(f_), data(x)
{}
template<class... Ts>
KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
{
f(rocm_unwrap(data), static_cast<Ts&&>(xs)...);
}
};
template<class F, class T>
KOKKOS_INLINE_FUNCTION rocm_capture_fn<F, T> rocm_capture(F f, T x)
{
return {f, x};
}
template<class F, class T, class U, class... Ts>
KOKKOS_INLINE_FUNCTION auto rocm_capture(F f, T x, U y, Ts... xs) -> decltype(rocm_capture(rocm_capture(f, x), y, xs...))
{
return rocm_capture(rocm_capture(f, x), y, xs...);
}
struct rocm_apply_op
{
template<class F, class... Ts>
KOKKOS_INLINE_FUNCTION void operator()(F&& f, Ts&&... xs) const
{
f(static_cast<Ts&&>(xs)...);
}
};
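// Illustration only: composing the helpers above. rocm_capture pre-binds
// leading arguments one at a time, and rocm_invoke inserts the Tag argument
// only when Tag is non-void. DemoTag and DemoFunctor are hypothetical:
#if 0
struct DemoTag {};
struct DemoFunctor {
  KOKKOS_INLINE_FUNCTION void operator()(DemoTag, int i, int j) const {}
};
KOKKOS_INLINE_FUNCTION void demo_invoke() {
  DemoFunctor f;
  auto g = make_rocm_invoke_fn<DemoTag>(f); // prepends DemoTag()
  auto h = rocm_capture(g, 1);              // binds i = 1
  h(2);                                     // calls f(DemoTag(), 1, 2)
}
#endif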
}}
#endif

View File

@ -0,0 +1,72 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if !defined( KOKKOS_ROCM_JOIN_H )
#define KOKKOS_ROCM_JOIN_H
namespace Kokkos {
namespace Impl {
// Adaptor to use ValueJoin with standard algorithms
template<class Joiner, class F>
struct join_operator
{
const F* fp;
template<class T, class U>
T operator()(T x, const U& y) const
{
Joiner::join(*fp, &x, &y);
return x;
}
};
template<class Joiner, class F>
join_operator<Joiner, F> make_join_operator(const F& f)
{
return join_operator<Joiner, F>{&f};
}
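// Illustration only: with some ValueJoin-style 'Joiner' for a functor F, the
// adaptor plugs partial results into standard algorithms (assumes <numeric>
// is available; demo_accumulate is hypothetical):
#if 0
template<class Joiner, class F, class T>
T demo_accumulate(const F& f, const T* first, const T* last, T init)
{
  return std::accumulate(first, last, init, make_join_operator<Joiner>(f));
}
#endif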
}}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,193 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
///////////////////////////////////////////////////////////////////////////////
// AMP REDUCE
//////////////////////////////////////////////////////////////////////////////
#if !defined( KOKKOS_ROCM_AMP_REDUCE_INL )
#define KOKKOS_ROCM_AMP_REDUCE_INL
#include <iostream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <type_traits>
#include <ROCm/Kokkos_ROCm_Tile.hpp>
#include <ROCm/Kokkos_ROCm_Invoke.hpp>
#include <ROCm/Kokkos_ROCm_Join.hpp>
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
namespace Kokkos {
namespace Impl {
template<class T>
T* reduce_value(T* x, std::true_type) [[hc]]
{
return x;
}
template<class T>
T& reduce_value(T* x, std::false_type) [[hc]]
{
return *x;
}
#if KOKKOS_ROCM_HAS_WORKAROUNDS
struct always_true
{
template<class... Ts>
bool operator()(Ts&&...) const
{
return true;
}
};
#endif
template< class Tag, class F, class ReducerType, class Invoker, class T >
void reduce_enqueue(
const int szElements, // size of the extent
const F & f,
const ReducerType& reducer,
Invoker invoke,
T * const output_result,
int const output_length,
const int team_size=64,
const int vector_size=1,
int const shared_size=0)
{
using namespace hc ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, F, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , Tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , Tag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , Tag > ValueJoin ;
typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , Tag > ValueFinal ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
if (output_length < 1) return;
assert(output_result != nullptr);
const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
// allocate host and device memory for the results from each team
std::vector<T> result_cpu(td.num_tiles*output_length);
hc::array<T> result(td.num_tiles*output_length);
auto fut = tile_for<T[]>(td, [=,&result](hc::tiled_index<1> t_idx, tile_buffer<T[]> buffer) [[hc]]
{
const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
const auto tile = t_idx.tile[0];
buffer.action_at(local, [&](T* state)
{
ValueInit::init(ReducerConditional::select(f, reducer), state);
invoke(make_rocm_invoke_fn<Tag>(f), t_idx, td, reduce_value(state, std::is_pointer<reference_type>()));
});
t_idx.barrier.wait();
// Reduce within a tile using multiple threads.
// even though buffer.size is always 64, the value 64 must be hard coded below
// due to a compiler bug
// for(std::size_t s = 1; s < buffer.size(); s *= 2)
for(std::size_t s = 1; s < 64; s *= 2)
{
const std::size_t index = 2 * s * local;
// if (index < buffer.size())
if (index < 64)
{
buffer.action_at(index, index + s, [&](T* x, T* y)
{
ValueJoin::join(ReducerConditional::select(f, reducer), x, y);
});
}
t_idx.barrier.wait();
}
// Store the tile result in the global memory.
if (local == 0)
{
#if KOKKOS_ROCM_HAS_WORKAROUNDS
// Workaround for assigning from LDS memory: std::copy should work
// directly
buffer.action_at(0, [&](T* x)
{
#if ROCM15
// new ROCM 15 address space changes aren't implemented in std algorithms yet
auto * src = reinterpret_cast<char *>(x);
auto * dest = reinterpret_cast<char *>(result.data()+tile*output_length);
for(int i=0; i<(int)(sizeof(T)*output_length); i++) dest[i] = src[i]; // copy all output_length values, matching the std::copy_if path below
#else
// Workaround: copy_if used to avoid memmove
std::copy_if(x, x+output_length, result.data()+tile*output_length, always_true{} );
#endif
});
#else
std::copy(buffer, buffer+output_length, result.data()+tile*output_length);
#endif
}
});
ValueInit::init(ReducerConditional::select(f, reducer), output_result);
fut.wait();
copy(result,result_cpu.data());
for(std::size_t i=0;i<td.num_tiles;i++)
ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
}
}} //end of namespace Kokkos::Impl
#endif /* #if !defined( KOKKOS_ROCM_AMP_REDUCE_INL ) */

View File

@ -0,0 +1,605 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCM_REDUCESCAN_HPP
#define KOKKOS_ROCM_REDUCESCAN_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
//#include <utility>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl( T & out , T const & in , int lane ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl( *reinterpret_cast<int const *>(&in) , lane , width );
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl( T & out , T const & in , int lane ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void rocm_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
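// Illustration only: the overloads above move any type whose size is a
// multiple of 4 bytes through the lane-shuffle network one int at a time,
// e.g. a double (two int-sized shuffles under the hood):
#if 0
KOKKOS_INLINE_FUNCTION
double demo_shfl_double(double x, int lane) {
  double y;
  rocm_shfl(y, x, lane, 64);
  return y;
}
#endif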
#if 0
//----------------------------------------------------------------------------
/** \brief Reduce within a workgroup over team.vector_length(), the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Requires:
* team.vector_length() is power of 2
* team.vector_length() <= 32 (one workgroup)
*
* Cannot use "butterfly" pattern because floating point
* addition is non-associative. Therefore, must broadcast
* the final result.
*/
template< class Reducer >
KOKKOS_INLINE_FUNCTION
void rocm_intra_workgroup_vector_reduce( Reducer const & reducer )
{
static_assert(
std::is_reference< typename Reducer::reference_type >::value , "" );
if ( 1 < team.vector_length() ) {
typename Reducer::value_type tmp ;
for ( int i = team.vector_length() ; ( i >>= 1 ) ; ) {
rocm_shfl_down( tmp , reducer.reference() , i , team.vector_length() );
if ( team.vector_rank() < i ) { reducer.join( reducer.data() , & tmp ); }
}
// Broadcast from root "lane" to all other "lanes"
rocm_shfl( reducer.reference() , reducer.reference() , 0 , team.vector_length() );
}
}
/** \brief Inclusive scan over team.vector_length(), the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Algorithm is concurrent bottom-up reductions in triangular pattern
* where each ROCM thread is the root of a reduction tree from the
* zeroth ROCM thread to itself.
*
* Requires:
* team.vector_length() is power of 2
* team.vector_length() <= 32 (one workgroup)
*/
template< typename ValueType >
KOKKOS_INLINE_FUNCTION
void rocm_intra_workgroup_vector_inclusive_scan( ValueType & local )
{
ValueType tmp ;
// Bottom up:
// [t] += [t-1] if t >= 1
// [t] += [t-2] if t >= 2
// [t] += [t-4] if t >= 4
// ...
for ( int i = 1 ; i < team.vector_length() ; i <<= 1 ) {
rocm_shfl_up( tmp , local , i , team.vector_length() );
if ( i <= team.vector_rank() ) { local += tmp ; }
}
}
#endif
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) threads with same team.team_rank() have same value
* (b) team.vector_length() == power of two
* (c) blockDim.z == 1
*/
template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_intra_workgroup_reduction( const ROCmTeamMember& team,
ValueType& result,
const JoinOp& join) {
unsigned int shift = 1;
int max_active_thread = team.team_size();
//Reduce over values from threads with different team.team_rank()
while(team.vector_length() * shift < 32 ) {
const ValueType tmp = shfl_down(result, team.vector_length()*shift,32u);
//Only join if upper thread is active (this allows a non-power-of-two team.team_size())
if(team.team_rank() + shift < max_active_thread)
join(result , tmp);
shift*=2;
}
result = shfl(result,0,32);
}
template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_inter_workgroup_reduction( const ROCmTeamMember& team,
ValueType& value,
const JoinOp& join) {
#define STEP_WIDTH 4
tile_static ValueType sh_result[256];
int max_active_thread = team.team_size();
ValueType* result = (ValueType*) & sh_result;
const unsigned step = 256 / team.vector_length();
unsigned shift = STEP_WIDTH;
const int id = team.team_rank()%step==0?team.team_rank()/step:65000;
if(id < STEP_WIDTH ) {
result[id] = value;
}
team.team_barrier();
while (shift<=max_active_thread/step) {
if(shift<=id && shift+STEP_WIDTH>id && team.vector_rank()==0) {
join(result[id%STEP_WIDTH],value);
}
team.team_barrier();
shift+=STEP_WIDTH;
}
value = result[0];
for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
join(value,result[i]);
}
#if 0
template< class ValueType , class JoinOp>
KOKKOS_INLINE_FUNCTION
void rocm_intra_block_reduction( ROCmTeamMember& team,
ValueType& value,
const JoinOp& join,
const int max_active_thread) {
rocm_intra_workgroup_reduction(team,value,join,max_active_thread);
rocm_inter_workgroup_reduction(team,value,join,max_active_thread);
}
template< class FunctorType , class JoinOp , class ArgTag = void >
KOKKOS_INLINE_FUNCTION
bool rocm_inter_block_reduction( ROCmTeamMember& team,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
const JoinOp& join,
ROCm::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
ROCm::size_type * const m_scratch_flags,
const int max_active_thread) {
#ifdef __ROCM_ARCH__
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
rocm_intra_block_reduction(value,join,max_active_thread);
const unsigned id = team.team_rank()*team.vector_length() + team.vector_rank();
//One thread in the block writes block result to global scratch_memory
if(id == 0 ) {
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
*global = value;
}
//One workgroup of last block performs inter block reduction through loading the block values from global scratch_memory
bool last_block = false;
team.team_barrier();
if ( id < 32 ) {
ROCm::size_type count;
//Figure out whether this is the last block
if(id == 0)
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
count = Kokkos::shfl(count,0,32);
//Last block does the inter block reduction
if( count == gridDim.x - 1) {
//set flag back to zero
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = neutral;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
//Reduce all global values with splitting work over threads in one workgroup
const int step_size = team.vector_length()*team.team_size() < 32 ? team.vector_length()*team.team_size() : 32;
for(int i=id; i<gridDim.x; i+=step_size) {
value_type tmp = global[i];
join(value, tmp);
}
      //Perform shfl reductions within the workgroup; only join if the contribution is valid (allows gridDim.x to be non-power-of-two and < 32)
if (team.vector_length()*team.team_size() > 1) {
value_type tmp = Kokkos::shfl_down(value, 1,32);
if( id + 1 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < gridDim.x )
join(value, tmp);
}
if (team.vector_length()*team.team_size() > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < gridDim.x )
join(value, tmp);
}
}
}
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
#else
return true;
#endif
}
#endif
#if 0
//----------------------------------------------------------------------------
// See section B.17 of ROCm C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// rocmFuncSetCacheConfig(MyKernel, rocmFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) team.team_size() is a power of two
* (b) team.team_size() <= 512
* (c) team.vector_length() == blockDim.z == 1
*/
template< bool DoScan , class FunctorType , class ArgTag >
KOKKOS_INLINE_FUNCTION
void rocm_intra_block_reduce_scan( const FunctorType & functor ,
const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
{
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
const unsigned value_count = ValueTraits::value_count( functor );
const unsigned BlockSizeMask = team.team_size() - 1 ;
// Must have power of two thread count
  if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_intra_block_reduce_scan requires power-of-two blockDim"); }
#define BLOCK_REDUCE_STEP( R , TD , S ) \
if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
#define BLOCK_SCAN_STEP( TD , N , S ) \
if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
const unsigned rtid_intra = team.team_rank() ^ BlockSizeMask ;
const pointer_type tdata_intra = base_data + value_count * team.team_rank() ;
{ // Intra-workgroup reduction:
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
}
team.team_barrier(); // Wait for all workgroups to reduce
{ // Inter-workgroup reduce-scan by a single workgroup to avoid extra synchronizations
const unsigned rtid_inter = ( team.team_rank() ^ BlockSizeMask ) << ROCmTraits::WarpIndexShift ;
if ( rtid_inter < team.team_size() ) {
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
if ( DoScan ) {
int n = ( rtid_inter & 32 ) ? 32 : (
( rtid_inter & 64 ) ? 64 : (
( rtid_inter & 128 ) ? 128 : (
( rtid_inter & 256 ) ? 256 : 0 )));
if ( ! ( rtid_inter + n < team.team_size() ) ) n = 0 ;
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,8)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,7)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,6)
__threadfence_block(); BLOCK_SCAN_STEP(tdata_inter,n,5)
}
}
}
team.team_barrier(); // Wait for inter-workgroup reduce-scan to complete
if ( DoScan ) {
int n = ( rtid_intra & 1 ) ? 1 : (
( rtid_intra & 2 ) ? 2 : (
( rtid_intra & 4 ) ? 4 : (
( rtid_intra & 8 ) ? 8 : (
( rtid_intra & 16 ) ? 16 : 0 ))));
if ( ! ( rtid_intra + n < team.team_size() ) ) n = 0 ;
#ifdef KOKKOS_IMPL_ROCM_CLANG_WORKAROUND
BLOCK_SCAN_STEP(tdata_intra,n,4) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) team.team_barrier();//__threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0) team.team_barrier();
#else
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0) __threadfence_block();
#endif
}
#undef BLOCK_SCAN_STEP
#undef BLOCK_REDUCE_STEP
}
//----------------------------------------------------------------------------
/**\brief Input value-per-thread starting at 'shared_data'.
* Reduction value at last thread's location.
*
* If 'DoScan' then write blocks' scan values and block-groups' scan values.
*
 *  Global reduce result is in the last thread's 'shared_data' location.
*/
template< bool DoScan , class FunctorType , class ArgTag >
KOKKOS_INLINE_FUNCTION
bool rocm_single_inter_block_reduce_scan( const FunctorType & functor ,
const ROCm::size_type block_id ,
const ROCm::size_type block_count ,
ROCm::size_type * const shared_data ,
ROCm::size_type * const global_data ,
ROCm::size_type * const global_flags )
{
typedef ROCm::size_type size_type ;
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename ValueTraits::value_type value_type ;
  // '__ffs' = position of the least significant bit set to 1.
  // 'team.team_size()' is guaranteed to be a power of two so this
  // is the integral shift value that can replace an integral divide,
  // e.g. team.team_size() == 64 gives BlockSizeShift == 6 and (i >> 6) == i / 64.
const unsigned BlockSizeShift = __ffs( team.team_size() ) - 1 ;
const unsigned BlockSizeMask = team.team_size() - 1 ;
// Must have power of two thread count
if ( BlockSizeMask & team.team_size() ) { Kokkos::abort("ROCm::rocm_single_inter_block_reduce_scan requires power-of-two blockDim"); }
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
// Reduce the accumulation for the entire block.
rocm_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
{
// Write accumulation total to global scratch space.
// Accumulation total is the last thread's data.
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
#if (__ROCM_ARCH__ < 500)
for ( size_type i = team.team_rank() ; i < word_count.value ; i += team.team_size() ) { global[i] = shared[i] ; }
#else
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
// If this block is not the last block to contribute to this group then the block is done.
team.team_barrier();
  const bool is_last_block =
    ! team.team_reduce( team.team_rank() ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) ,Impl::JoinAdd<value_type>());
if ( is_last_block ) {
const size_type b = ( long(block_count) * long(team.team_rank()) ) >> BlockSizeShift ;
const size_type e = ( long(block_count) * long( team.team_rank() + 1 ) ) >> BlockSizeShift ;
{
void * const shared_ptr = shared_data + word_count.value * team.team_rank() ;
reference_type shared_value = ValueInit::init( functor , shared_ptr );
for ( size_type i = b ; i < e ; ++i ) {
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
}
}
rocm_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
if ( DoScan ) {
size_type * const shared_value = shared_data + word_count.value * ( team.team_rank() ? team.team_rank() - 1 : team.team_size() );
if ( ! team.team_rank() ) { ValueInit::init( functor , shared_value ); }
// Join previous inclusive scan value to each member
for ( size_type i = b ; i < e ; ++i ) {
size_type * const global_value = global_data + word_count.value * i ;
ValueJoin::join( functor , shared_value , global_value );
ValueOps ::copy( functor , global_value , shared_value );
}
}
}
return is_last_block ;
}
// Size in bytes required for inter block reduce or scan
template< bool DoScan , class FunctorType , class ArgTag >
inline
unsigned rocm_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
{
return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
}
#endif
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( __ROCMCC__ ) */
#endif /* KOKKOS_ROCM_REDUCESCAN_HPP */

View File

@ -0,0 +1,157 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <ROCm/Kokkos_ROCm_Invoke.hpp>
#include <ROCm/Kokkos_ROCm_Join.hpp>
namespace Kokkos {
namespace Impl {
template< class Tag, class F, class TransformIndex>
void scan_enqueue(
const int len,
const F & f,
TransformIndex transform_index)
{
typedef Kokkos::Impl::FunctorValueTraits< F, Tag> ValueTraits;
typedef Kokkos::Impl::FunctorValueInit< F, Tag> ValueInit;
typedef Kokkos::Impl::FunctorValueJoin< F, Tag> ValueJoin;
typedef Kokkos::Impl::FunctorValueOps< F, Tag> ValueOps;
typedef typename ValueTraits::value_type value_type;
typedef typename ValueTraits::pointer_type pointer_type;
typedef typename ValueTraits::reference_type reference_type;
const auto td = get_tile_desc<value_type>(len);
std::vector<value_type> result_cpu(td.num_tiles);
hc::array<value_type> result(td.num_tiles);
hc::array<value_type> scratch(len);
tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]]
{
const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
const auto tile = t_idx.tile[0];
// Join tile buffer elements
const auto join = [&](std::size_t i, std::size_t j)
{
buffer.action_at(i, j, [&](value_type& x, const value_type& y)
{
ValueJoin::join(f, &x, &y);
});
};
// Copy into tile
buffer.action_at(local, [&](value_type& state)
{
ValueInit::init(f, &state);
if (global < len) rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), state, false);
});
t_idx.barrier.wait();
// Up sweep phase
for(std::size_t d=1;d<buffer.size();d*=2)
{
auto d2 = 2*d;
auto i = local*d2;
if(i<len)
{
auto j = i + d - 1;
auto k = i + d2 - 1;
// join(k, j); // no longer needed with ROCm 1.6
ValueJoin::join(f, &buffer[k], &buffer[j]);
}
}
t_idx.barrier.wait();
result[tile] = buffer[buffer.size()-1];
buffer[buffer.size()-1] = 0;
// Down sweep phase
for(std::size_t d=buffer.size()/2;d>0;d/=2)
{
auto d2 = 2*d;
auto i = local*d2;
if(i<len)
{
auto j = i + d - 1;
auto k = i + d2 - 1;
auto t = buffer[k];
// join(k, j); // no longer needed with ROCm 1.6
ValueJoin::join(f, &buffer[k], &buffer[j]);
buffer[j] = t;
}
t_idx.barrier.wait();
}
// Copy tiles into global memory
if (global < len) scratch[global] = buffer[local];
}).wait();
copy(result,result_cpu.data());
  // std::partial_sum was segfaulting here, even though this is CPU code:
  // if(td.num_tiles>1)
  //   std::partial_sum(result_cpu.data(), result_cpu.data()+(td.num_tiles-1)*sizeof(value_type), result_cpu.data(), make_join_operator<ValueJoin>(f));
  // Join the per-tile totals with an explicit loop instead.
for(int i=1; i<td.num_tiles; i++)
ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
copy(result_cpu.data(),result);
hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]]
{
// const auto local = t_idx.local[0];
const auto global = t_idx.global[0];
const auto tile = t_idx.tile[0];
if (global < len)
{
auto final_state = scratch[global];
      // ValueJoin::join was locking up (at least with ROCm 1.6), so use += directly.
      if (tile != 0) final_state += result[tile-1];
      // if (tile != 0) ValueJoin::join(f, &final_state, &result[tile-1]);
rocm_invoke<Tag>(f, transform_index(t_idx, td.tile_size, td.num_tiles), final_state, true);
}
}).wait();
}
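// The tile kernel above follows the classic Blelloch up-sweep/down-sweep
// pattern. A minimal CPU sketch of that pattern (illustrative only; assumes
// a power-of-two length and '+' as the join):
#if 0
inline void blelloch_exclusive_scan( int * data , std::size_t n )
{
  // Up-sweep: build partial sums in place.
  for ( std::size_t d = 1 ; d < n ; d *= 2 )
    for ( std::size_t i = 0 ; i + 2*d <= n ; i += 2*d )
      data[ i + 2*d - 1 ] += data[ i + d - 1 ];
  // Clear the total, then down-sweep to distribute exclusive prefixes.
  data[ n - 1 ] = 0 ;
  for ( std::size_t d = n/2 ; d > 0 ; d /= 2 )
    for ( std::size_t i = 0 ; i + 2*d <= n ; i += 2*d ) {
      const int t = data[ i + d - 1 ];
      data[ i + d - 1 ]     = data[ i + 2*d - 1 ];
      data[ i + 2*d - 1 ]  += t ;
    }
}
// e.g. {1,2,3,4} -> {0,1,3,6}
#endif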
} // namespace Impl
} // namespace Kokkos

View File

@ -0,0 +1,726 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <atomic>
#include <Kokkos_Macros.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_ROCM
#include <Kokkos_Core.hpp>
#include <Kokkos_ROCm.hpp>
#include <Kokkos_ROCmSpace.hpp>
#include <impl/Kokkos_Error.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#define ROCM_SAFE_CALL
namespace Kokkos {
namespace Impl {
using namespace hc;
DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<HostSpace,Kokkos::Experimental::ROCmHostPinnedSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
DeepCopy<Kokkos::Experimental::ROCmHostPinnedSpace,HostSpace,Kokkos::Experimental::ROCm>::DeepCopy( const Kokkos::Experimental::ROCm & instance , void * dst , const void * src , size_t n )
{
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
av.copy( src , dst , n);
}
hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n) {
hc::accelerator acc;
hc::accelerator_view av = acc.get_default_view();
return(av.copy_async( src , dst , n));
}
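// Usage sketch (illustrative only; 'n' is a hypothetical extent): these
// specializations are what back Kokkos::deep_copy between ROCm and host
// memory, e.g.
#if 0
Kokkos::View< double * , Kokkos::Experimental::ROCmSpace > dev( "dev" , n );
Kokkos::View< double * , Kokkos::HostSpace >               host( "host" , n );
Kokkos::deep_copy( host , dev ); // dispatches to DeepCopy< HostSpace , ROCmSpace , ROCm >
#endif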
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
void Experimental::ROCmSpace::access_error()
{
const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
void Experimental::ROCmSpace::access_error( const void * const )
{
const std::string msg("Kokkos::Experimental::ROCmSpace::access_error attempt to execute Experimental::ROCm function from non-ROCm space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
ROCmSpace::ROCmSpace()
: m_device( ROCm().rocm_device() )
{
}
ROCmHostPinnedSpace::ROCmHostPinnedSpace()
{
}
void * ROCmSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = Kokkos::Impl::rocm_device_allocate( arg_alloc_size );
return ptr ;
}
void * Experimental::ROCmHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = Kokkos::Impl::rocm_hostpinned_allocate( arg_alloc_size );
return ptr ;
}
void ROCmSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
}
void Experimental::ROCmHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
Kokkos::Impl::rocm_device_free(arg_alloc_ptr);
}
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record ;
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record ;
std::string
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_label() const
{
SharedAllocationHeader header ;
Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );
return std::string( header.m_label );
}
std::string
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_label() const
{
return std::string( RecordBase::head()->m_label );
}
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
allocate( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
allocate( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
void
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
void
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
~SharedAllocationRecord()
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
SharedAllocationHeader header ;
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) );
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmSpace::name()),header.m_label,
data(),size());
}
#endif
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
~SharedAllocationRecord()
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::ROCmHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
data(),size());
}
#endif
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
#endif
SharedAllocationHeader header ;
// Fill in the Header information
header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
strncpy( header.m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Copy to device memory
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
}
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
#endif
// Fill in the Header information, directly accessible via host pinned memory
RecordBase::m_alloc_ptr->m_record = this ;
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
//----------------------------------------------------------------------------
void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;
SharedAllocationRecord * const r =
allocate( arg_space , arg_alloc_label , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
void SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
}
}
void * SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size )
{
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
Kokkos::Impl::DeepCopy<Kokkos::Experimental::ROCmSpace,Kokkos::Experimental::ROCmSpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
#if 0
void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;
SharedAllocationRecord * const r =
allocate( arg_space , arg_alloc_label , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
void SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
}
}
void * SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size )
{
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
Kokkos::Impl::DeepCopy<Experimental::ROCmHostPinnedSpace,Experimental::ROCmHostPinnedSpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
#endif
//----------------------------------------------------------------------------
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordBase = SharedAllocationRecord< void , void > ;
using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
#if 0
// Copy the header from the allocation
Header head ;
Header const * const head_rocm = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
if ( alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,Experimental::ROCmSpace>( & head , head_rocm , sizeof(SharedAllocationHeader) );
}
RecordROCm * const record = alloc_ptr ? static_cast< RecordROCm * >( head.m_record ) : (RecordROCm *) 0 ;
if ( ! alloc_ptr || record->m_alloc_ptr != head_rocm ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
}
#else
// Iterate the list to search for the record among all allocations
// requires obtaining the root of the list and then locking the list.
RecordROCm * const record = static_cast< RecordROCm * >( RecordBase::find( & s_root_record , alloc_ptr ) );
if ( record == 0 ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::get_record ERROR" ) );
}
#endif
return record ;
}
#if 0
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordROCm = SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void > ;
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordROCm * >( h->m_record );
}
#endif
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >::
print_records( std::ostream & s , const Kokkos::Experimental::ROCmSpace & space , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
char buffer[256] ;
SharedAllocationHeader head ;
if ( detail ) {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
}
else {
head.m_label[0] = 0 ;
}
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "ROCm addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "ROCm addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
}
snprintf( buffer , 256
, format_string
, reinterpret_cast<uintptr_t>( r )
, reinterpret_cast<uintptr_t>( r->m_prev )
, reinterpret_cast<uintptr_t>( r->m_next )
, reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
std::cout << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
else {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,Kokkos::Experimental::ROCmSpace>( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "ROCm [ 0x%.12lx + %ld ] %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "ROCm [ 0x%.12llx + %ld ] %s\n";
}
snprintf( buffer , 256
, format_string
, reinterpret_cast< uintptr_t >( r->data() )
, r->size()
, head.m_label
);
}
else {
snprintf( buffer , 256 , "ROCm [ 0 + 0 ]\n" );
}
std::cout << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
}
#if 0
void
SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::Experimental::ROCmHostPinnedSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "ROCmHostPinned" , & s_root_record , detail );
}
#endif
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
#if 0
KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_atomic() {
unsigned i = tindex()*team_size() + lindex();
if(i<ROCM_SPACE_ATOMIC_MASK+1)
kokkos_impl_rocm_lock_arrays.atomic[i] = 0;
}
KOKKOS_INLINE_FUNCTION void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = tindex()*team_size() + lindex();
if(i<N) {
kokkos_impl_rocm_lock_arrays.scratch[i] = 0;
kokkos_impl_rocm_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* atomic_lock_array_rocm_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
rocmFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
rocmMalloc(&ptr,sizeof(int)*(ROCM_SPACE_ATOMIC_MASK+1));
return ptr;
}
int* scratch_lock_array_rocm_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
rocmFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
return ptr;
}
int* threadid_lock_array_rocm_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
rocmFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
rocmMalloc(&ptr,sizeof(int)*(ROCm::concurrency()));
return ptr;
}
void init_lock_arrays_rocm_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::ROCmLockArraysStruct locks;
locks.atomic = atomic_lock_array_rocm_space_ptr(false);
locks.scratch = scratch_lock_array_rocm_space_ptr(false);
locks.threadid = threadid_lock_array_rocm_space_ptr(false);
am_copyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
init_lock_array_kernel_atomic<<<(ROCM_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Experimental::ROCm::concurrency()+255)/256,256>>>(Kokkos::Experimental::ROCm::concurrency());
}
}
#endif
void* rocm_resize_scratch_space(size_t bytes, bool force_shrink) {
static void* ptr = NULL;
static size_t current_size = 0;
if(current_size == 0) {
current_size = bytes;
ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
}
if(bytes > current_size) {
current_size = bytes;
ptr = Kokkos::kokkos_realloc<Kokkos::Experimental::ROCmSpace>(ptr,current_size);
}
if((bytes < current_size) && (force_shrink)) {
current_size = bytes;
Kokkos::kokkos_free<Kokkos::Experimental::ROCmSpace>(ptr);
ptr = Kokkos::kokkos_malloc<Kokkos::Experimental::ROCmSpace>("ROCmSpace::ScratchMemory",current_size);
}
return ptr;
}
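// Usage sketch (illustrative only; 'n_blocks' and 'value_size' are
// hypothetical): callers grow the shared device scratch allocation on demand,
// e.g. before launching a reduction:
#if 0
void * const scratch = rocm_resize_scratch_space( n_blocks * value_size , false );
#endif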
}
}
#endif // KOKKOS_ENABLE_ROCM

View File

@ -0,0 +1,174 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Experimental::ROCm > ;
//----------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver
( TaskQueueSpecialization< Kokkos::Experimental::ROCm >::queue_type * const queue,
hc::tiled_index<3> threadIdx )
{
using Member = TaskExec< Kokkos::Experimental::ROCm > ;
using Queue = TaskQueue< Kokkos::Experimental::ROCm > ;
using task_root_type = TaskBase< void , void , void > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1, threadIdx );
Member team_exec( threadIdx.tile_dim[0], threadIdx );
const int wavefront_lane = threadIdx.local[0] + threadIdx.local[1]* threadIdx.tile_dim[0] ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == wavefront_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<ROCm>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
#endif
}
// shuffle broadcast
task.raw[0] = hc::__shfl( task.raw[0] , 0 );
task.raw[1] = hc::__shfl( task.raw[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
}
else if ( 0 == threadIdx.local[1] ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
}
if ( 0 == wavefront_lane ) {
queue->complete( task.ptr );
}
}
} while(1);
}
#if 0
namespace {
KOKKOS_INLINE_FUNCTION
void rocm_task_queue_execute( TaskQueue< Kokkos::Experimental::ROCm > * queue,
hc::tiled_index<3> threadIdx )
{ TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue, threadIdx ); }
}
#endif
void TaskQueueSpecialization< Kokkos::Experimental::ROCm >::execute
( TaskQueue< Kokkos::Experimental::ROCm > * const queue )
{
const int workgroups_per_wavefront = 4 ;
const int wavefront_size = Kokkos::Impl::ROCmTraits::WavefrontSize ;
const int cu_count = Kokkos::Impl::rocm_internal_cu_count();
// const dim3 grid( Kokkos::Impl::rocm_internal_cu_count() , 1 , 1 );
// const dim3 block( 1 , Kokkos::Impl::ROCmTraits::WorkGroupSize , workgroups_per_wavefront );
// Query the stack size, in bytes:
// If not large enough then set the stack size, in bytes:
  // Adapted from the CUDA code. TODO: not at all sure that this is the proper
  // way to map the CUDA grid/blocks/3D tiling to HCC
#if 0
hc::extent< 3 > flat_extent( cu_count,
wavefront_size, workgroups_per_wavefront );
hc::tiled_extent< 3 > team_extent = flat_extent.tile(1,
wavefront_size,workgroups_per_wavefront);
hc::parallel_for_each( team_extent , [&](hc::tiled_index<3> idx) [[hc]]
{
TaskQueueSpecialization< Kokkos::Experimental::ROCm >::driver( queue,idx );
}).wait();
#endif
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_ROCM ) && defined( KOKKOS_ENABLE_TASKDAG ) */

View File

@ -0,0 +1,458 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ROCM_TASK_HPP
#define KOKKOS_IMPL_ROCM_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKDAG )
#include <ROCm/Kokkos_ROCm_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class > class TaskExec ;
template<>
class TaskQueueSpecialization< Kokkos::Experimental::ROCm >
{
public:
using execution_space = Kokkos::Experimental::ROCm ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = TaskExec< execution_space > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
KOKKOS_INLINE_FUNCTION
static void driver( queue_type * const, hc::tiled_index<3> );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( typename TaskBase< Kokkos::Experimental::ROCm
, typename FunctorType::value_type
, FunctorType
>::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::Experimental::ROCm
, typename FunctorType::value_type
, FunctorType
> ;
hc::extent< 1 > flat_extent( 1 );
hc::tiled_extent< 1 > team_extent = flat_extent.tile( 1);
hc::parallel_for_each( team_extent , [&](hc::tiled_index<1> idx) [[hc]]
{
*ptr = TaskType::apply ;
}).wait();
}
};
/*template<>
KOKKOS_FUNCTION
void TaskQueue<Kokkos::Experimental::ROCm>::decrement( typename TaskQueue<Kokkos::Experimental::ROCm>::task_root_type *
) {}
*/
extern template class TaskQueue< Kokkos::Experimental::ROCm > ;
//----------------------------------------------------------------------------
/**\brief Impl::TaskExec<ROCm> is the TaskScheduler<ROCm>::member_type
* passed to tasks running in a ROCm space.
*
* ROCm thread blocks for tasking are dimensioned:
* idx.tile_dim[0] == vector length
* idx.tile_dim[1] == team size
* idx.tile_dim[2] == number of teams
* where
* idx.tile_dim[0] * idx.tile_dim[1] == WavefrontSize
*
 *   Both single thread and thread team tasks are run by a full ROCm wavefront.
 *   A single thread task is called by wavefront lane #0 and the remaining
 *   lanes of the wavefront are idle.
*/
template<>
class TaskExec< Kokkos::Experimental::ROCm >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::ROCm > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::ROCm > ;
int m_team_size ;
hc::tiled_index<3> m_idx;
// KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size ) //TODO: tile_dim[0]
// : m_team_size( arg_team_size ) {}
KOKKOS_INLINE_FUNCTION TaskExec( int arg_team_size,
hc::tiled_index<3> tidx)
: m_team_size( arg_team_size),
m_idx( tidx ) {}
public:
// const auto local = t_idx.local[0];
// const auto global = t_idx.global[0];
// const auto tile = t_idx.tile[0];
hc::tiled_index<3> idx() const { return m_idx;}
#if defined( __HCC_ACCELERATOR__ )
KOKKOS_INLINE_FUNCTION void team_barrier() { /* __threadfence_block(); */ }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_idx.local[1] ; } // t_idx.tile[0];
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() {}
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 0 ; }
#endif
};
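// A sketch of the tiling described in the comment block above (illustrative
// only; the concrete sizes are assumptions, with WavefrontSize taken as 64):
#if 0
constexpr int vector_length = 8 ; // idx.tile_dim[0]
constexpr int team_size     = 8 ; // idx.tile_dim[1]; vector_length * team_size == WavefrontSize
constexpr int num_teams     = 4 ; // idx.tile_dim[2]
hc::extent<3>       flat( vector_length , team_size , num_teams );
hc::tiled_extent<3> tiled = flat.tile( vector_length , team_size , num_teams );
#endif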
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
TeamThreadRange
( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
}
template<typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::Experimental::ROCm > >
TeamThreadRange
( Impl:: TaskExec< Kokkos::Experimental::ROCm > & thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type<iType1, iType2>::type iType;
return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread, begin, end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::Experimental::ROCm > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 *  The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
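// Usage sketch (illustrative only; 'member' and 'N' are placeholders for the
// TaskExec<ROCm>& handed to a task body and a hypothetical extent):
#if 0
Kokkos::parallel_for( Kokkos::TeamThreadRange( member , N ) ,
  [&]( const int i ) { /* each team thread handles a strided slice of 0..N-1 */ } );
#endif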
// reduce across corresponding lanes between team members within workgroup
// assume stride*team_size == workgroup_size
template< typename ValueType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_workgroup_reduction
  (void f(ValueType&, const ValueType&),
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
f(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
}
}
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_workgroup_reduction
(const JoinType& join,
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
join(val, shfl_down(val, lane_delta, team_size*stride));
}
}
// multiple within-workgroup non-strided reductions
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_workgroup_reduction
(const JoinType& join,
ValueType& val,
int vec_length)
{
for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
join(val, shfl_down(val, lane_delta, vec_length));
}
}
// broadcast within workgroup
template< class ValueType >
KOKKOS_INLINE_FUNCTION
ValueType shfl_workgroup_broadcast
(ValueType& val,
int src_lane,
int width)
{
return shfl(val, src_lane, width);
}
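// Combined usage sketch (illustrative only; 'team_size' and 'stride' are
// hypothetical values): a strided wavefront sum followed by a broadcast so
// every lane sees the result, assuming team_size * stride == WavefrontSize:
#if 0
const int team_size = 8 , stride = 8 ; // team_size * stride == WavefrontSize (assumed 64)
double v = 1.0 ;                       // per-lane value
strided_shfl_workgroup_reduction(
  [] ( double & a , const double & b ) { a += b ; } , v , team_size , stride );
v = shfl_workgroup_broadcast( v , 0 , Impl::ROCmTraits::WavefrontSize );
#endif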
// all-reduce across corresponding vector lanes between team members within workgroup
// assume vec_length*team_size == workgroup_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
initialized_result = result;
strided_shfl_workgroup_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
loop_boundaries.thread.team_size(),
idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
strided_shfl_workgroup_reduction<ValueType, JoinType>(
join,
initialized_result,
loop_boundaries.thread.team_size(),
idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, idx.local[0], Impl::ROCmTraits::WavefrontSize );
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
ValueType result = initialized_result;
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
//initialized_result = multi_shfl_workgroup_reduction(
multi_shfl_workgroup_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
multi_shfl_workgroup_reduction<ValueType, JoinType>(join, initialized_result, idx.tile_dim[0]);
initialized_result = shfl_workgroup_broadcast<ValueType>( initialized_result, 0, idx.tile_dim[0] );
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// INCLUSIVE scan of 'val' across team members, strided by the vector
// length idx.tile_dim[0]; 'accum' carries the running total from
// previous iterations of the outer loop
for( int offset = idx.tile_dim[0] ; offset < Impl::ROCmTraits::WavefrontSize ; offset <<= 1 ) {
y = shfl_up(val, offset, Impl::ROCmTraits::WavefrontSize);
if(idx.local[1]*idx.tile_dim[0] >= offset) { val += y; }
}
// broadcast the team total (last lane of the inclusive scan) to all threads
local_total = shfl_workgroup_broadcast<ValueType>(val,
idx.local[0]+Impl::ROCmTraits::WavefrontSize-idx.tile_dim[0],
Impl::ROCmTraits::WavefrontSize);
// make EXCLUSIVE scan by shifting values over one
val = shfl_up(val, idx.tile_dim[0], Impl::ROCmTraits::WavefrontSize);
if ( idx.local[1] == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
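// Illustrative contract sketch (hedged): with per-iteration
// contributions counts = {1,2,3,4}, an exclusive scan passes
// val = {0,1,3,6} on the final == true pass ('member', 'offsets' and
// 'counts' are hypothetical names):
//
//   parallel_scan( Kokkos::TeamThreadRange( member, 4 ),
//                  [&]( const int i, int & val, const bool final )
//                  {
//                    if ( final ) offsets(i) = val; // exclusive prefix
//                    val += counts(i);
//                  } );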
// scan across vector lanes within a single team member
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::ROCm > >& loop_boundaries,
const Lambda & lambda)
{
hc::tiled_index<3> idx = loop_boundaries.thread.idx();
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// INCLUSIVE scan of 'val' across the idx.tile_dim[0] vector lanes;
// 'accum' carries the running total from previous iterations of the
// outer loop
for( int offset = 1 ; offset < idx.tile_dim[0] ; offset <<= 1 ) {
y = shfl_up(val, offset, idx.tile_dim[0]);
if(idx.local[0] >= offset) { val += y; }
}
// broadcast the lane total (last lane of the inclusive scan) to all lanes
local_total = shfl_workgroup_broadcast<ValueType>(val, idx.tile_dim[0]-1,
idx.tile_dim[0]);
// make EXCLUSIVE scan by shifting values over one
val = shfl_up(val, 1, idx.tile_dim[0]);
if ( idx.local[0] == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_ROCM_TASK_HPP */

View File

@ -0,0 +1,518 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <hc.hpp>
#include <type_traits>
#include <vector>
#include <memory>
#include <ROCm/Kokkos_ROCm_Config.hpp>
#if !defined( KOKKOS_ROCM_TILE_H )
#define KOKKOS_ROCM_TILE_H
// Macro to abstract out the enable_if craziness
#define KOKKOS_ROCM_REQUIRES(...) \
bool KokkosROCmRequiresBool ## __LINE__ = true, typename std::enable_if<KokkosROCmRequiresBool ## __LINE__ && (__VA_ARGS__), int>::type = 0
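// Illustrative use of the macro (hypothetical function):
//
//   template< class T, KOKKOS_ROCM_REQUIRES( std::is_integral<T>::value ) >
//   T twice( T x ) { return x + x; }   // participates only for integral T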
// Compiler builds with a workweek newer than 17160 belong to the ROCm 1.5 release.
#if __hcc_workweek__ > 17160
#define ROCM15 1
#endif
namespace Kokkos {
namespace Impl {
template<class T>
#if defined(ROCM15)
using lds_t = T;
#else
// prior to 1.5, needed to decorate LDS addresses
using lds_t = __attribute__((address_space(3))) T;
#endif
#define KOKKOS_ROCM_TILE_RESTRIC_CPU restrict(cpu, amp)
// a set of routines to replace the std:: routines
// so that they operate on address-space-3 (LDS) types
#if defined(ROCM15)
// 1.5 can't use std::copy et al for LDS access, so we define our own
// set of routines
template<class I, class O>
void rcopy(I first, I last, O out) [[hc]]
{
while (first != last) *out++ = *first++;
}
template<class I,class F>
void rfor_each(I first, I last, F f) [[hc]]
{
for(;first!=last;++first) f(*first);
}
template<class I,class O,class F>
void rtransform(I first, I last, O out, F f) [[hc]]
{
while(first!=last) *out++ = f(*first++);
}
#endif
inline std::size_t get_max_tile_size() KOKKOS_ROCM_TILE_RESTRIC_CPU
{
return hc::accelerator().get_max_tile_static_size() - 1024;
}
inline std::size_t get_max_tile_thread() KOKKOS_ROCM_TILE_RESTRIC_CPU
{
return 64;
}
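// Round x up to the next power of two by smearing its highest set bit
// into all lower bits and adding one; e.g. next_pow_2(33) == 64,
// next_pow_2(64) == 64, and next_pow_2(0) == 0.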
inline int next_pow_2(int x) restrict(cpu, amp)
{
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return x+1;
}
template<class T>
inline std::size_t get_tile_size(std::size_t n = 1,
std::size_t team = 64,
std::size_t vector = 1)
KOKKOS_ROCM_TILE_RESTRIC_CPU
{
const auto size = sizeof(T) * n;
const auto group_size = get_max_tile_size();
if (size == 0 || size > group_size) return 0;
// Assume that thread size is a power of 2
auto thread_size = std::min(team*vector,4*get_max_tile_thread());
// ensure that we have enough tile static memory to keep
// threadsize * size elements for reductions
while(size > (group_size / thread_size) && thread_size > 2)
{
thread_size /= 2;
}
return thread_size;
}
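// Worked example (hedged, assuming the common 64KB of LDS so that
// get_max_tile_size() == 64*1024 - 1024 == 64512): for T == double and
// n == 256 the per-thread footprint is 8*256 == 2048 bytes, and the
// starting thread_size is min(64*1, 4*64) == 64. The loop then halves
// thread_size twice (64512/64 == 1008 < 2048, 64512/32 == 2016 < 2048)
// and returns 16, since 64512/16 == 4032 >= 2048.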
template<class T>
struct array_view
{
T* x;
std::size_t n;
array_view(T* xp, std::size_t np) [[hc]] [[cpu]]
: x(xp), n(np)
{}
array_view(T* xp, T* yp) [[hc]] [[cpu]]
: x(xp), n(yp-xp)
{}
T& operator[](std::size_t i) const [[hc]] [[cpu]]
{
return x[i];
}
std::size_t size() const [[hc]] [[cpu]]
{
return this->n;
}
T* data() const [[hc]] [[cpu]]
{
return x;
}
T* begin() const [[hc]] [[cpu]]
{
return x;
}
T* end() const [[hc]] [[cpu]]
{
return x+this->size();
}
};
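// Illustrative usage sketch (hedged):
//
//   double buf[8];
//   array_view<double> v( buf, 8 );
//   for ( auto & x : v ) x = 0.0;   // begin()/end() enable range-for
//   assert( v.size() == 8 && v.data() == buf );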
template<class T>
struct rocm_char
{ using type=char; };
template<class T>
struct rocm_char<const T>
: std::add_const<typename rocm_char<T>::type>
{};
#if !defined(ROCM15)
// earlier compilers required explicit address space decorations
template<class T>
struct rocm_char<__attribute__((address_space(3))) T>
{ using type = __attribute__((address_space(3))) typename rocm_char<T>::type; };
template<class T>
struct rocm_char<const __attribute__((address_space(3))) T>
{ using type = const __attribute__((address_space(3))) typename rocm_char<T>::type; };
#endif
template<class T, class Char=typename rocm_char<T>::type>
Char* rocm_byte_cast(T& x) restrict(cpu, amp)
{
return reinterpret_cast<Char*>(&x);
}
template<class T, class U>
void rocm_raw_assign(T& x, const U& y) restrict(cpu, amp)
{
auto * src = rocm_byte_cast(y);
auto * dest = rocm_byte_cast(x);
#if defined (ROCM15)
rcopy(src, src+sizeof(T), dest);
#else
std::copy(src, src+sizeof(T), dest);
#endif
}
template<class T, class U>
void rocm_assign_impl(T& x, const U& y, std::true_type) restrict(cpu, amp)
{
rocm_raw_assign(x, y);
}
template<class T, class U>
void rocm_assign_impl(T& x, const U& y, std::false_type) restrict(cpu, amp)
{
x = y;
}
// Workaround for assigning in and out of LDS memory
template<class T, class U>
void rocm_assign(T& x, const U& y) restrict(cpu, amp)
{
rocm_assign_impl(x, y, std::integral_constant<bool, (
sizeof(T) == sizeof(U)
)>());
}
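// Illustrative usage sketch (hedged): copy between an LDS-qualified
// element and a plain register value without tripping over the address
// space mismatch ('buffer' and 'lane' are hypothetical):
//
//   lds_t<double> & shared = buffer[lane];
//   double local;
//   rocm_assign( local, shared );   // LDS -> register, bytewise
//   rocm_assign( shared, local );   // register -> LDS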
// Compute the address space of tile
template<class T>
struct tile_type
{
#if defined (ROCM15)
typedef T type;
#else
typedef __attribute__((address_space(3))) T type;
#endif
};
#if !defined (ROCM15)
template<class T, class Body>
void lds_for(__attribute__((address_space(3))) T& value, Body b) [[hc]]
{
T state = value;
b(state);
value = state;
}
#endif
template<class T, class Body>
void lds_for(T& value, Body b) [[hc]]
{
b(value);
}
constexpr std::size_t get_max_tile_array_size()
{
return 24;
}
template<class Derived, class T>
struct single_action
{
template<class Action>
void action_at(std::size_t i, Action a) [[hc]]
{
auto& value = static_cast<Derived&>(*this)[i];
#if KOKKOS_ROCM_HAS_WORKAROUNDS
T state = value;
a(state);
value = state;
#else
a(value);
#endif
}
template<class Action>
void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
{
static_cast<Derived&>(*this).action_at(i, [&](T& x)
{
static_cast<Derived&>(*this).action_at(j, [&](T& y)
{
a(x, y);
});
});
}
};
template<class T>
struct tile_buffer
: array_view<typename tile_type<T>::type>, single_action<tile_buffer<T>, T>
{
typedef typename tile_type<T>::type element_type;
typedef array_view<element_type> base;
using base::base;
tile_buffer(element_type* xp, std::size_t np, std::size_t) [[hc]] [[cpu]]
: base(xp, np)
{}
tile_buffer(T* xp, T* yp, std::size_t) [[hc]] [[cpu]]
: base(xp, yp)
{}
};
template<class T>
struct tile_buffer<T[]>
{
typedef typename tile_type<T>::type element_type;
typedef typename tile_type<char>::type tchar_type;
element_type* element_data;
std::size_t n, m;
tile_buffer(element_type* xp, std::size_t np, std::size_t mp) [[hc]] [[cpu]]
: element_data(xp), n(np), m(mp)
{}
tile_buffer(element_type* xp, element_type* yp, std::size_t mp) [[hc]] [[cpu]]
: element_data(xp), n(yp-xp), m(mp)
{}
element_type* operator[](std::size_t i) const [[hc]] [[cpu]]
{
return element_data+i*m;
}
template<class Action, class Q = T>
typename Impl::enable_if< (sizeof(Q) <= 8) , void >::type
action_at(std::size_t i, Action a) [[hc]]
{
element_type* value = (*this)[i];
#if defined (ROCM15)
a(value);
#else
#if KOKKOS_ROCM_HAS_WORKAROUNDS
if (m > get_max_tile_array_size()) return;
T state[get_max_tile_array_size()];
// std::copy(value, value+m, state);
// Workaround for assigning from LDS memory
std::transform(value, value+m, state, [](element_type& x)
{
T result;
rocm_assign(result, x);
return result;
});
a(state);
std::copy(state, state+m, value);
#endif
#endif
}
template<class Action, class Q = T>
typename Impl::enable_if< !(sizeof(Q) <= 8) , void >::type
action_at(std::size_t i, Action a) [[hc]]
{
element_type* value = (*this)[i];
#if defined (ROCM15)
a(value);
#else
//#if KOKKOS_ROCM_HAS_WORKAROUNDS
if (m > get_max_tile_array_size()) return;
T state[get_max_tile_array_size()];
// std::copy(value, value+m, state);
// Workaround for assigning from LDS memory
std::transform(value, value+m, state, [](element_type& x)
{
T result;
rocm_assign(result, x);
return result;
});
a(state);
// this workaround is required when T is larger than 8 bytes
tile_static char tv[64*sizeof(T)];
size_t sT = sizeof(T);
for (int j = 0; j<sT; j++) tv[i*sT+j] = ((char *)state)[j];
for (int j = 0; j<sT; j++) ((tchar_type *)value)[j] = tv[i*sT+j];
#endif
}
template<class Action>
void action_at(std::size_t i, std::size_t j, Action a) [[hc]]
{
this->action_at(i, [&](T* x)
{
this->action_at(j, [&](T* y)
{
a(x, y);
});
});
}
std::size_t size() const [[hc]] [[cpu]]
{
return this->n;
}
element_type* data() const [[hc]] [[cpu]]
{
return element_data;
}
};
// Zero initialize LDS memory
struct zero_init_f
{
template<class T>
#if defined (ROCM15)
void operator()(T& x, std::size_t=1) const [[hc]]
{
auto * start = reinterpret_cast<char*>(&x);
for(int i=0; i<sizeof(T);i++) start[i] = 0;
rocm_raw_assign(x, T());
}
#else
void operator()(__attribute__((address_space(3))) T& x, std::size_t=1) const [[hc]]
{
auto * start = reinterpret_cast<__attribute__((address_space(3))) char*>(&x);
std::fill(start, start+sizeof(T), 0);
rocm_raw_assign(x, T());
}
#endif
template<class T>
#if defined (ROCM15)
void operator()(T* x, std::size_t size) const [[hc]]
{
rfor_each(x, x+size, *this);
}
#else
void operator()(__attribute__((address_space(3))) T* x, std::size_t size) const [[hc]]
{
std::for_each(x, x+size, *this);
}
#endif
};
static constexpr zero_init_f zero_init = {};
struct tile_desc
{
// Number of work items, or size of extent
std::size_t elements;
// number of threads in team
std::size_t team_size;
// vector length of team
std::size_t vector_length;
// Size of tile
std::size_t tile_size;
// Size of array
std::size_t array_size;
// Number of tiles
std::size_t num_tiles;
// Per team reserved LDS memory, used for reduction
std::size_t reduce_size;
// Per-team shared memory in LDS; this is in addition to the reduction scratch above
std::size_t shared_size;
std::size_t size;
};
template<class T>
tile_desc get_tile_desc(std::size_t size,
std::size_t array_size=1,
std::size_t team_size=64,
std::size_t vector_size=1,
std::size_t shared_size=0)
{
tile_desc result;
result.elements = size;
result.array_size = array_size;
result.vector_length = vector_size;
result.team_size = team_size;
result.tile_size = get_tile_size<T>(array_size,team_size,vector_size);
result.num_tiles = std::ceil(1.0 * size / result.tile_size);
result.reduce_size = result.tile_size * sizeof(T) * array_size;
result.shared_size = shared_size;
result.size = result.tile_size * result.num_tiles;
return result;
}
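// Worked example (hedged, again assuming 64KB of LDS): for
// get_tile_desc<double>( 10000 ) the defaults give tile_size == 64,
// num_tiles == ceil(10000/64) == 157, reduce_size == 64*8 == 512 bytes
// per team, and size == 157*64 == 10048 padded work items.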
template<class U, class F, class T=typename std::remove_extent<U>::type>
hc::completion_future tile_for(tile_desc td, F f)
{
assert(td.array_size <= get_max_tile_array_size() && "Exceeds max array size");
assert(((td.size % td.tile_size) == 0) && "Extent must be divisible by tile size");
auto grid = hc::extent<1>(td.size).tile_with_dynamic(
td.tile_size, td.reduce_size + td.shared_size);
// grid.set_dynamic_group_segment_size(td.reduce_size + td.shared_size);
return parallel_for_each(grid, [=](hc::tiled_index<1> t_idx) [[hc]]
{
#if defined (ROCM15)
typedef T group_t;
#else
typedef __attribute__((address_space(3))) T group_t;
#endif
group_t * buffer = (group_t *)hc::get_dynamic_group_segment_base_pointer();
tile_buffer<U> tb(buffer, td.tile_size, td.array_size);
zero_init(tb[t_idx.local[0]], td.array_size);
f(t_idx, tb);
});
}
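// Illustrative dispatch sketch (hedged; 'n' and the lambda body are
// hypothetical):
//
//   auto td = get_tile_desc<double>( n );
//   tile_for<double>( td, [=]( hc::tiled_index<1> t_idx,
//                              tile_buffer<double> & buf ) [[hc]]
//   {
//     // buf[t_idx.local[0]] is this thread's zero-initialized LDS slot
//   }).wait();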
}}
#endif

View File

@ -0,0 +1,346 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ROCM_VECTORIZATION_HPP
#define KOKKOS_ROCM_VECTORIZATION_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if ROCM is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_ROCM
#include <Kokkos_ROCm.hpp>
namespace Kokkos {
using namespace hc;
// Shuffle only makes sense on >= Fiji GPUs; it doesn't work on CPUs
// or other GPUs. We provide a generic definition (which is trivial
// and doesn't do what it claims to do) because we don't actually use
// this function unless we are on a suitable GPU, with a suitable
// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow"
// internal parameter depends both on the ExecutionSpace and the Scalar type,
// and it controls whether shfl_down() gets called.)
namespace Impl {
template< typename Scalar >
struct shfl_union {
enum {n = sizeof(Scalar)/4};
float fval[n];
KOKKOS_INLINE_FUNCTION
Scalar value() {
return *(Scalar*) fval;
}
KOKKOS_INLINE_FUNCTION
void operator= (Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
KOKKOS_INLINE_FUNCTION
void operator= (const Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
};
}
#ifdef __HCC_ACCELERATOR__
KOKKOS_INLINE_FUNCTION
int __long2loint(const long val ) {
union {
long l;
int i[2];
} u;
u.l = val;
return u.i[0];
}
KOKKOS_INLINE_FUNCTION
int __long2hiint(const long val ) {
union {
long l;
int i[2];
} u;
u.l = val;
return u.i[1];
}
KOKKOS_INLINE_FUNCTION
int __double2loint(const double val ) {
union {
double d;
int i[2];
} u;
u.d = val;
return u.i[0];
}
KOKKOS_INLINE_FUNCTION
int __double2hiint(const double val ) {
union {
double d;
int i[2];
} u;
u.d = val;
return u.i[1];
}
KOKKOS_INLINE_FUNCTION
long __hiloint2long(const int hi, const int lo ) {
union {
long l;
int i[2];
} u;
u.i[0] = lo;
u.i[1] = hi;
return u.l;
}
KOKKOS_INLINE_FUNCTION
double __hiloint2double(const int hi, const int lo ) {
union {
double d;
int i[2];
} u;
u.i[0] = lo;
u.i[1] = hi;
return u.d;
}
KOKKOS_INLINE_FUNCTION
int shfl(const int &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
KOKKOS_INLINE_FUNCTION
float shfl(const float &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl(tmp,srcLane,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl(const double &val, const int& srcLane, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
return r_val.value();
}
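// Illustrative usage sketch (hedged): broadcast lane 0's value across a
// 64-wide wavefront ('val' is hypothetical):
//
//   double lane0 = shfl( val, 0, 64 );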
KOKKOS_INLINE_FUNCTION
int shfl_down(const int &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_down(const float &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_down(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
long shfl_down(const long &val, const int& delta, const int& width) {
int lo = __long2loint(val);
int hi = __long2hiint(val);
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
return __hiloint2long(hi,lo);
}
KOKKOS_INLINE_FUNCTION
double shfl_down(const double &val, const int& delta, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
return r_val.value();
}
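// Illustrative usage sketch (hedged): a classic wavefront sum via
// shfl_down, leaving the total in lane 0 ('val' is hypothetical):
//
//   double sum = val;
//   for ( int offset = 32 ; offset > 0 ; offset >>= 1 )
//     sum += shfl_down( sum, offset, 64 );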
KOKKOS_INLINE_FUNCTION
int shfl_up(const int &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_up(const float &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_up(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl_up(const double &val, const int& delta, const int& width ) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
return r_val.value();
}
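// Illustrative usage sketch (hedged): an inclusive wavefront prefix sum
// via shfl_up ('val' and 'lane' are hypothetical):
//
//   for ( int offset = 1 ; offset < 64 ; offset <<= 1 ) {
//     double y = shfl_up( val, offset, 64 );
//     if ( lane >= offset ) val += y;
//   }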
#else
template<typename Scalar>
inline
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<8.0.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<8.0.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<8.0.");
return val;
}
#endif
}
#endif // KOKKOS_ENABLE_ROCM
#endif

View File

@ -0,0 +1,367 @@
#pragma once
#include "hc.hpp"
#include <cmath>
// Math functions with integer overloads will be converted to
// this floating point type.
#define HC_IMPLICIT_FLOAT_CONV double
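// e.g. with the wrappers below, sqrt(4) resolves to the integral
// overload and computes sqrt(static_cast<double>(4)), returning double.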
#ifdef __KALMAR_ACCELERATOR__
#define HC_MATH_WRAPPER_1(function, arg1) \
template<typename T> \
inline T function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_1(function, arg1) HC_MATH_WRAPPER_1(function, arg1)
#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
template<typename T> \
inline \
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
} \
template<typename T> \
inline \
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1)
#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
template<typename T> \
inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2); \
}
#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
template<typename T> \
inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
template<typename T> \
inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_TQ(function, arg1) \
template<typename T, typename Q> \
inline T function(Q arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
template<typename Q> \
inline \
typename std::enable_if<std::is_integral<Q>::value,T>::type \
function(Q arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
}\
template<typename Q> \
inline \
typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
function(Q arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
}\
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1,arg2); \
}
#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
hc::precise_math::function(arg1, arg2, arg3); \
}
#else
#define HC_MATH_WRAPPER_1(function, arg1) \
template<typename T> \
inline T function(T arg1) __attribute__((hc,cpu)) { \
return std::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_1(function, arg1) \
template<typename T> \
inline T function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
template<typename T> \
inline \
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return ::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
} \
template<typename T> \
inline \
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return std::function(arg1); \
}
#define KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(function, arg1) \
template<typename T> \
inline \
typename std::enable_if<std::is_integral<T>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
} \
template<typename T> \
inline \
typename std::enable_if<std::is_floating_point <T>::value,T>::type \
function(T arg1) __attribute__((hc,cpu)) { \
return hc::precise_math::function(arg1); \
}
#define HC_MATH_WRAPPER_2(function, arg1, arg2) \
template<typename T> \
inline T function(T arg1, T arg2) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2); \
}
#define HC_MATH_ALIAS_2(alias, function, arg1, arg2) \
template<typename T> \
inline T alias(T arg1, T arg2) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_3(function, arg1, arg2, arg3) \
template<typename T> \
inline T function(T arg1, T arg2, T arg3) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_TQ(function, arg1) \
template<typename T, typename Q> \
inline T function(Q arg1) __attribute__((hc,cpu)) { \
return std::function(arg1); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TQ(function, T, arg1) \
template<typename Q> \
inline \
typename std::enable_if<std::is_integral<Q>::value,T>::type \
function(Q arg1) __attribute__((hc)) { \
return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1)); \
}\
template<typename Q> \
inline \
typename std::enable_if<std::is_floating_point<Q>::value,T>::type \
function(Q arg1) __attribute__((hc)) { \
return std::function(arg1); \
}
#define HC_MATH_WRAPPER_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline T function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2); \
}
#define HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(function, arg1, arg2) \
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_integral<T>::value||std::is_integral<Q>::value,HC_IMPLICIT_FLOAT_CONV>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return std::function(static_cast<HC_IMPLICIT_FLOAT_CONV>(arg1),static_cast<HC_IMPLICIT_FLOAT_CONV>(arg2)); \
}\
template<typename T, typename Q> \
inline \
typename std::enable_if<std::is_floating_point<T>::value&&std::is_floating_point<Q>::value,T>::type \
function(T arg1, Q arg2) __attribute__((hc,cpu)) { \
return std::function(arg1,arg2); \
}
#define HC_MATH_WRAPPER_TTTQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline T function(T arg1, T arg2, Q arg3) __attribute__((hc,cpu)) { \
return std::function(arg1, arg2, arg3); \
}
#define HC_MATH_WRAPPER_VTQQ(function, arg1, arg2, arg3) \
template<typename T, typename Q> \
inline void function(T arg1, Q arg2, Q arg3) __attribute__((hc,cpu)) { \
std::function(arg1, arg2, arg3); \
}
#endif
// override global math functions
namespace std {
// The following math functions are NOT available because they lack a GPU implementation
//
// erfinv
// erfcinv
// fpclassify
//
// The following math functions are NOT available because they lack a CPU implementation
//
// cospif
// cospi
// rsqrtf
// rsqrt
// sinpif
// sinpi
// tanpi
//
HC_MATH_WRAPPER_TQ(ilogbf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(ilogb, int, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isfinite, bool, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isinf, bool, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnan, bool, x)
HC_MATH_WRAPPER_FP_OVERLOAD_TQ(isnormal, bool, x)
HC_MATH_WRAPPER_TQ(nanf, tagp)
HC_MATH_WRAPPER_TQ(nan, tagp)
//HC_MATH_WRAPPER_TQ(signbitf, x)
HC_MATH_WRAPPER_TQ(signbit, x)
HC_MATH_WRAPPER_TTQ(frexpf, x, exp)
HC_MATH_WRAPPER_TTQ(frexp, x, exp)
HC_MATH_WRAPPER_TTQ(ldexpf, x, exp)
HC_MATH_WRAPPER_TTQ(ldexp, x, exp)
HC_MATH_WRAPPER_TTQ(lgammaf, x, exp)
HC_MATH_WRAPPER_TTQ(lgamma, x, exp)
HC_MATH_WRAPPER_TTQ(modff, x, exp)
HC_MATH_WRAPPER_TTQ(modf, x, exp)
HC_MATH_WRAPPER_TTQ(scalbnf, x, exp)
HC_MATH_WRAPPER_TTQ(scalbn, x, exp)
HC_MATH_WRAPPER_TTTQ(remquof, x, y, quo)
HC_MATH_WRAPPER_TTTQ(remquo, x, y, quo)
HC_MATH_WRAPPER_VTQQ(sincosf, x, s, c)
HC_MATH_WRAPPER_VTQQ(sincos, x, s, c)
HC_MATH_WRAPPER_1(acosf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(acos, x)
HC_MATH_WRAPPER_1(acoshf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(acosh, x)
HC_MATH_WRAPPER_1(asinf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(asin, x)
HC_MATH_WRAPPER_1(asinhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(asinh, x)
HC_MATH_WRAPPER_1(atanf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(atan, x)
HC_MATH_WRAPPER_1(atanhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(atanh, x)
HC_MATH_WRAPPER_2(atan2f, x, y)
HC_MATH_WRAPPER_2(atan2, x, y)
HC_MATH_WRAPPER_1(cbrtf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(cbrt, x)
HC_MATH_WRAPPER_1(ceilf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(ceil, x)
HC_MATH_WRAPPER_2(copysignf, x, y)
HC_MATH_WRAPPER_2(copysign, x, y)
HC_MATH_WRAPPER_1(cosf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(cos, x)
HC_MATH_WRAPPER_1(coshf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(cosh, x)
KALMAR_MATH_WRAPPER_1(cospif, x)
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(cospi, x)
HC_MATH_WRAPPER_1(erff, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(erf, x)
HC_MATH_WRAPPER_1(erfcf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(erfc, x)
HC_MATH_WRAPPER_1(expf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp, x)
HC_MATH_WRAPPER_1(exp2f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp2, x)
HC_MATH_WRAPPER_1(exp10f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(exp10, x)
HC_MATH_WRAPPER_1(expm1f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(expm1, x)
HC_MATH_WRAPPER_1(fabsf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(fabs, x)
HC_MATH_WRAPPER_2(fdimf, x, y)
HC_MATH_WRAPPER_2(fdim, x, y)
HC_MATH_WRAPPER_1(floorf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(floor, x)
HC_MATH_WRAPPER_3(fmaf, x, y, z)
HC_MATH_WRAPPER_3(fma, x, y, z)
HC_MATH_WRAPPER_2(fmaxf, x, y)
HC_MATH_WRAPPER_2(fmax, x, y)
HC_MATH_WRAPPER_2(fminf, x, y)
HC_MATH_WRAPPER_2(fmin, x, y)
HC_MATH_WRAPPER_2(fmodf, x, y)
HC_MATH_WRAPPER_2(fmod, x, y)
HC_MATH_WRAPPER_2(hypotf, x, y)
HC_MATH_WRAPPER_2(hypot, x, y)
HC_MATH_WRAPPER_1(logf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log, x)
HC_MATH_WRAPPER_1(log10f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log10, x)
HC_MATH_WRAPPER_1(log2f, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log2, x)
HC_MATH_WRAPPER_1(log1pf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(log1p, x)
HC_MATH_WRAPPER_1(logbf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(logb, x)
HC_MATH_WRAPPER_1(nearbyintf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(nearbyint, x)
HC_MATH_WRAPPER_2(nextafterf, x, y)
HC_MATH_WRAPPER_2(nextafter, x, y)
HC_MATH_WRAPPER_2(powf, x, y)
HC_MATH_WRAPPER_FP_OVERLOAD_TTQ(pow,x,y)
//HC_MATH_WRAPPER_1(rcbrtf, x)
//HC_MATH_WRAPPER_1(rcbrt, x)
HC_MATH_WRAPPER_2(remainderf, x, y)
HC_MATH_WRAPPER_2(remainder, x, y)
HC_MATH_WRAPPER_1(roundf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(round, x)
KALMAR_MATH_WRAPPER_1(rsqrtf, x)
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(rsqrt, x)
HC_MATH_WRAPPER_2(scalbf, x, exp)
HC_MATH_WRAPPER_2(scalb, x, exp)
HC_MATH_WRAPPER_1(sinf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(sin, x)
HC_MATH_WRAPPER_1(sinhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(sinh, x)
KALMAR_MATH_WRAPPER_1(sinpif, x)
KALMAR_MATH_WRAPPER_FP_OVERLOAD_1(sinpi, x)
HC_MATH_WRAPPER_1(sqrtf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(sqrt, x)
HC_MATH_WRAPPER_1(tgammaf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(tgamma, x)
HC_MATH_WRAPPER_1(tanf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(tan, x)
HC_MATH_WRAPPER_1(tanhf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(tanh, x)
HC_MATH_WRAPPER_1(truncf, x)
HC_MATH_WRAPPER_FP_OVERLOAD_1(trunc, x)
//HC_MATH_ALIAS_2(min, fmin, x, y)
//HC_MATH_ALIAS_2(max, fmax, x, y)
} // namespace std

View File

@ -125,6 +125,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
//----------------------------------------------------------------------------
// GCC native CAS supports int, long, unsigned int, unsigned long.
// Intel native CAS supports int and long with the same interface as GCC.
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
@ -280,6 +281,7 @@ T atomic_compare_exchange( volatile T * const dest, const T compare, const T val
#endif
#endif
#endif // !defined ROCM_ATOMICS
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -158,6 +158,7 @@ T atomic_fetch_add( volatile T * const dest ,
#endif
#endif
//----------------------------------------------------------------------------
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
@ -355,6 +356,7 @@ T atomic_fetch_add( volatile T * const dest , const T val )
#endif
#endif
#endif // !defined ROCM_ATOMICS
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_add without the fetch

View File

@ -135,6 +135,7 @@ T atomic_fetch_sub( volatile T * const dest ,
#endif
#endif
//----------------------------------------------------------------------------
#if !defined(KOKKOS_ENABLE_ROCM_ATOMICS)
#if !defined(__CUDA_ARCH__) || defined(KOKKOS_IMPL_CUDA_CLANG_WORKAROUND)
#if defined(KOKKOS_ENABLE_GNU_ATOMICS) || defined(KOKKOS_ENABLE_INTEL_ATOMICS)
@ -263,6 +264,8 @@ T atomic_fetch_sub( volatile T * const dest , const T val )
#endif
#endif
#endif // !defined ROCM_ATOMICS
// Simpler version of atomic_fetch_sub without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION

View File

@ -238,7 +238,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
*dest = Oper::apply(return_val, val);
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
// This is a way to (hopefully) avoid dead lock in a warp
T return_val;
int done = 0;
@ -277,7 +277,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
*dest = return_val;
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA)
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
int done = 0;

View File

@ -62,6 +62,8 @@ int bit_first_zero( unsigned i ) noexcept
#if defined( __CUDA_ARCH__ )
return full != i ? __ffs( ~i ) - 1 : -1 ;
#elif defined( __HCC_ACCELERATOR__ )
return full != i ? (int)hc::__firstbit_u32_u32(~i) : -1 ;
#elif defined( KOKKOS_COMPILER_INTEL )
return full != i ? _bit_scan_forward( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_IBM )
@ -82,6 +84,8 @@ int bit_scan_forward( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __HCC_ACCELERATOR__ )
return (int)hc::__firstbit_u32_u32(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
@ -106,6 +110,8 @@ int bit_scan_reverse( unsigned i )
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( __HCC_ACCELERATOR__ )
return (int)hc::__firstbit_u32_u32(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
@ -130,6 +136,8 @@ int bit_count( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __HCC_ACCELERATOR__ )
return (int)hc::__popcount_u32_b32(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )

View File

@ -72,6 +72,10 @@ uint64_t clock_tic(void) noexcept
return clock64();
#elif defined(__HCC_ACCELERATOR__)
// Get clock register
return hc::__clock_u64();
#elif defined( __i386__ ) || defined( __x86_64 )
// Return value of 64-bit hi-res clock register.

View File

@ -80,7 +80,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
const int num_threads = args.num_threads;
const int use_numa = args.num_numa;
#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS )
#if defined( KOKKOS_ENABLE_CUDA )
#if defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
const int use_gpu = args.device_id;
#endif // defined( KOKKOS_ENABLE_CUDA ) || defined( KOKKOS_ENABLE_ROCM )
@ -162,6 +162,18 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
if (use_gpu > -1) {
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( use_gpu ) );
}
else {
Kokkos::Experimental::ROCm::initialize();
}
std::cout << "Kokkos::initialize() fyi: ROCm enabled and initialized" << std::endl ;
}
#endif
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
@ -181,6 +193,13 @@ void finalize_internal( const bool all_spaces = false )
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
if(Kokkos::Experimental::ROCm::is_initialized())
Kokkos::Experimental::ROCm::finalize();
}
#endif
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
if( std::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
if(Kokkos::Experimental::OpenMPTarget::is_initialized())
@ -225,6 +244,12 @@ void fence_internal()
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if( std::is_same< Kokkos::Experimental::ROCm , Kokkos::DefaultExecutionSpace >::value ) {
Kokkos::Experimental::ROCm::fence();
}
#endif
#if defined( KOKKOS_ENABLE_OPENMP )
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {

View File

@ -75,7 +75,7 @@ void abort( const char * const message ) {
#ifdef __CUDA_ARCH__
Kokkos::Impl::cuda_abort(message);
#else
#ifndef KOKKOS_ENABLE_OPENMPTARGET
#if !defined(KOKKOS_ENABLE_OPENMPTARGET) && !defined(__HCC_ACCELERATOR__)
Kokkos::Impl::host_abort(message);
#endif
#endif

View File

@ -275,7 +275,9 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
for ( int i = 0 ; i < end ; ++i ) {
((int8_t*) & value )[i] = int8_t( step );
}
// Do not remove this store fence: without it the spin-wait below
// can hang on GCC with more than 8 threads.
store_fence();
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}

View File

@ -53,6 +53,8 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"

View File

@ -140,12 +140,6 @@
#endif
#endif
#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
#ifndef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
#endif
#endif
#ifdef KOKKOS_HAVE_CXX1Z
#ifndef KOKKOS_ENABLE_CXX1Z
#define KOKKOS_ENABLE_CXX1Z KOKKOS_HAVE_CXX1Z

View File

@ -456,10 +456,11 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// task->m_next == member of linked list (queue)
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, task->m_dep_count
, task->m_task_type
, task->m_priority
, task->m_ref_count );
@ -597,7 +598,6 @@ void TaskQueue< ExecSpace >::complete
, task->m_task_type
, task->m_priority
, task->m_ref_count );
fflush( stdout );
#endif
task_root_type volatile & t = *task ;

View File

@ -1015,8 +1015,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutLeft and LayoutRight are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutLeft and LayoutRight are only compatible when rank <= 1" );
}
template< class DimRHS >
@ -1024,8 +1029,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutLeft and LayoutStride are only compatible when rank <= 1" );
if ( rhs.m_stride.S0 != 1 ) {
Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft from LayoutStride requires stride == 1" );
}
@ -1493,8 +1503,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutRight
constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutRight and LayoutLeft are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutRight and LayoutLeft are only compatible when rank <= 1" );
}
template< class DimRHS >
@ -1502,8 +1517,13 @@ struct ViewOffset< Dimension , Kokkos::LayoutRight
ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
: m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
{
static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
, "ViewOffset LayoutLeft/Right and LayoutStride are only compatible when rank == 1" );
static_assert(
( DimRHS::rank == 0 &&
dimension_type::rank == 0 ) ||
( DimRHS::rank == 1 &&
dimension_type::rank == 1 &&
dimension_type::rank_dynamic == 1 )
, "ViewOffset LayoutRight and LayoutString are only compatible when rank <= 1" );
if ( rhs.m_stride.S0 != 1 ) {
Kokkos::abort("Kokkos::Impl::ViewOffset assignment of LayoutLeft/Right from LayoutStride requires stride == 1" );
}

View File

@ -143,7 +143,7 @@ public:
//----------------------------------------
~ViewOffset() = default ;
KOKKOS_FUNCTION_DEFAULTED ~ViewOffset() = default ;
KOKKOS_INLINE_FUNCTION ViewOffset() = default ;
KOKKOS_INLINE_FUNCTION ViewOffset( const ViewOffset & ) = default ;
KOKKOS_INLINE_FUNCTION ViewOffset & operator = ( const ViewOffset & ) = default ;