Update Kokkos library to v2.03.13

Stan Moore
2017-07-31 10:34:21 -06:00
parent 6d0a228624
commit 13f2d39f55
171 changed files with 44039 additions and 2807 deletions

File diff suppressed because it is too large

View File

@ -53,6 +53,7 @@
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -125,53 +126,12 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
#endif
namespace Kokkos {
namespace Impl {
struct CudaLockArraysStruct {
int* atomic;
int* scratch;
int* threadid;
int n;
};
}
}
__device__ __constant__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
namespace Kokkos {
namespace Impl {
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
}
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
}
}
}
template< typename T >
inline
__device__
@ -192,7 +152,7 @@ namespace Impl {
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType >
template< class DriverType>
__global__
static void cuda_parallel_launch_constant_memory()
{
@ -202,19 +162,39 @@ static void cuda_parallel_launch_constant_memory()
driver();
}
template< class DriverType >
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType>
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType ,
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType
, class LaunchBounds = Kokkos::LaunchBounds<>
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType >
struct CudaParallelLaunch< DriverType , true > {
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
inline
CudaParallelLaunch( const DriverType & driver
@ -238,26 +218,19 @@ struct CudaParallelLaunch< DriverType , true > {
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
@ -267,8 +240,8 @@ struct CudaParallelLaunch< DriverType , true > {
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType , false > {
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
inline
CudaParallelLaunch( const DriverType & driver
@ -284,22 +257,15 @@ struct CudaParallelLaunch< DriverType , false > {
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
}
#endif
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
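// Illustrative sketch, not part of this diff: the LaunchBounds policy trait
// threaded through CudaParallelLaunch above is what carries maxTperB /
// minBperSM into the new __launch_bounds__ kernel wrappers.  A caller would
// request it roughly like this (the values 256 and 4 are examples only, and
// "example_launch_bounds" is a hypothetical helper, not Kokkos API):
inline void example_launch_bounds( const int N )
{
  Kokkos::parallel_for(
    Kokkos::RangePolicy< Kokkos::Cuda , Kokkos::LaunchBounds< 256 , 4 > >( 0 , N ) ,
    KOKKOS_LAMBDA ( const int i ) { (void) i ; } );
}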

View File

@ -230,18 +230,6 @@ void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t
} catch(...) {}
}
constexpr const char* CudaSpace::name() {
return m_name;
}
constexpr const char* CudaUVMSpace::name() {
return m_name;
}
constexpr const char* CudaHostPinnedSpace::name() {
return m_name;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -655,11 +643,12 @@ reallocate_tracked( void * const arg_alloc_ptr
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordBase = SharedAllocationRecord< void , void > ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
#if 0
using Header = SharedAllocationHeader ;
// Copy the header from the allocation
Header head ;
@ -812,83 +801,6 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1)
kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
}
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<N) {
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
return ptr;
}
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
void init_lock_arrays_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
}
}
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
static void* ptr = NULL;
static std::int64_t current_size = 0;
@ -908,8 +820,8 @@ void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
return ptr;
}
}
}
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
#endif // KOKKOS_ENABLE_CUDA

View File

@ -51,6 +51,7 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -69,9 +70,6 @@
__device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
__device__ __constant__
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#endif
/*--------------------------------------------------------------------------*/
@ -103,6 +101,7 @@ int cuda_kernel_arch()
return arch ;
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
bool cuda_launch_blocking()
{
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
@ -111,16 +110,13 @@ bool cuda_launch_blocking()
return atoi(env);
}
#endif
}
void cuda_device_synchronize()
{
// static const bool launch_blocking = cuda_launch_blocking();
// if (!launch_blocking) {
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
// }
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
@ -240,6 +236,7 @@ public:
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
uint32_t m_maxConcurrency ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
@ -248,6 +245,7 @@ public:
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
uint32_t * m_scratchConcurrentBitset ;
cudaStream_t * m_stream ;
static int was_initialized;
@ -274,6 +272,7 @@ public:
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_maxConcurrency( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
@ -282,6 +281,7 @@ public:
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_scratchConcurrentBitset( 0 )
, m_stream( 0 )
{}
@ -327,7 +327,8 @@ CudaInternal::~CudaInternal()
if ( m_stream ||
m_scratchSpace ||
m_scratchFlags ||
m_scratchUnified ) {
m_scratchUnified ||
m_scratchConcurrentBitset ) {
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
<< std::endl ;
std::cerr.flush();
@ -339,6 +340,7 @@ CudaInternal::~CudaInternal()
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_maxConcurrency = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
@ -347,6 +349,7 @@ CudaInternal::~CudaInternal()
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_scratchConcurrentBitset = 0 ;
m_stream = 0 ;
}
@ -485,6 +488,33 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
//----------------------------------
// Concurrent bitset for obtaining unique tokens from within
// an executing kernel.
{
const unsigned max_threads_per_sm = 2048 ; // up to capability 7.0
m_maxConcurrency =
max_threads_per_sm * cudaProp.multiProcessorCount ;
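// Illustrative arithmetic, not in the original: on a 56-SM device
// (e.g. P100) this gives 2048 * 56 = 114688 concurrency slots to track.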
const int32_t buffer_bound =
Kokkos::Impl::concurrent_bitset::buffer_bound( m_maxConcurrency );
// Allocate and initialize uint32_t[ buffer_bound ]
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchBitset"
, sizeof(uint32_t) * buffer_bound );
Record::increment( r );
m_scratchConcurrentBitset = reinterpret_cast<uint32_t *>( r->data() );
CUDA_SAFE_CALL( cudaMemset( m_scratchConcurrentBitset , 0 , sizeof(uint32_t) * buffer_bound ) );
}
//----------------------------------
if ( stream_count ) {
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
@ -543,16 +573,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_arrays_cuda_space();
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
Impl::initialize_host_cuda_lock_arrays();
}
//----------------------------------------------------------------------------
@ -635,9 +656,7 @@ void CudaInternal::finalize()
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
atomic_lock_array_cuda_space_ptr(true);
scratch_lock_array_cuda_space_ptr(true);
threadid_lock_array_cuda_space_ptr(true);
Impl::finalize_host_cuda_lock_arrays();
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@ -653,6 +672,7 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchConcurrentBitset ) );
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
@ -666,6 +686,7 @@ void CudaInternal::finalize()
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_scratchConcurrentBitset = 0 ;
m_stream = 0 ;
}
}
@ -713,9 +734,8 @@ namespace Kokkos {
Cuda::size_type Cuda::detect_device_count()
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
int Cuda::concurrency() {
return 131072;
}
int Cuda::concurrency()
{ return Impl::CudaInternal::singleton().m_maxConcurrency ; }
int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
@ -798,7 +818,22 @@ void Cuda::fence()
const char* Cuda::name() { return "Cuda"; }
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
UniqueToken< Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >::
UniqueToken( Kokkos::Cuda const & )
: m_buffer( Kokkos::Impl::CudaInternal::singleton().m_scratchConcurrentBitset )
, m_count( Kokkos::Impl::CudaInternal::singleton().m_maxConcurrency )
{}
} // namespace Experimental
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
#endif // KOKKOS_ENABLE_CUDA
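// Illustrative sketch, not part of this diff: with the constructor above, the
// new concurrent bitset backs Kokkos::Experimental::UniqueToken, letting a
// thread inside a kernel claim an index that is unique among concurrently
// executing threads (e.g. to pick a private scratch slot).  The helper name
// and the "slots" view below are hypothetical.
inline void example_unique_token( const int N )
{
  Kokkos::Experimental::UniqueToken<
    Kokkos::Cuda , Kokkos::Experimental::UniqueTokenScope::Global >
      token( Kokkos::Cuda() );
  Kokkos::View< int * , Kokkos::CudaSpace > slots( "slots" , token.size() );
  Kokkos::parallel_for( N , KOKKOS_LAMBDA ( const int i ) {
    const int id = token.acquire() ;  // unique among live threads, < token.size()
    slots( id ) += 1 ;                // no other live thread holds this id
    token.release( id ) ;
    (void) i ;
  } );
}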

View File

@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Kokkos_Cuda.hpp>
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
namespace Kokkos {
namespace Impl {
__device__ __constant__
CudaLockArrays g_device_cuda_lock_arrays = { nullptr, nullptr, 0 };
}
}
#endif
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel_atomic() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1) {
Kokkos::Impl::g_device_cuda_lock_arrays.atomic[i] = 0;
}
}
__global__ void init_lock_array_kernel_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<(unsigned)N) {
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[i] = 0;
}
}
} // namespace
namespace Impl {
CudaLockArrays g_host_cuda_lock_arrays = { nullptr, nullptr, 0 };
void initialize_host_cuda_lock_arrays() {
if (g_host_cuda_lock_arrays.atomic != nullptr) return;
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.atomic,
sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1)));
CUDA_SAFE_CALL(cudaMalloc(&g_host_cuda_lock_arrays.scratch,
sizeof(int)*(Cuda::concurrency())));
CUDA_SAFE_CALL(cudaDeviceSynchronize());
g_host_cuda_lock_arrays.n = Cuda::concurrency();
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+1+255)/256,256>>>();
init_lock_array_kernel_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
CUDA_SAFE_CALL(cudaDeviceSynchronize());
}
void finalize_host_cuda_lock_arrays() {
if (g_host_cuda_lock_arrays.atomic == nullptr) return;
cudaFree(g_host_cuda_lock_arrays.atomic);
g_host_cuda_lock_arrays.atomic = nullptr;
cudaFree(g_host_cuda_lock_arrays.scratch);
g_host_cuda_lock_arrays.scratch = nullptr;
g_host_cuda_lock_arrays.n = 0;
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE();
#endif
}
} // namespace Impl
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_CUDA_CUDA_LOCKS_PREVENT_LINK_ERROR() {}
#endif

View File

@ -0,0 +1,166 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_LOCKS_HPP
#define KOKKOS_CUDA_LOCKS_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <cstdint>
#include <Cuda/Kokkos_Cuda_Error.hpp>
namespace Kokkos {
namespace Impl {
struct CudaLockArrays {
std::int32_t* atomic;
std::int32_t* scratch;
std::int32_t n;
};
/// \brief This global variable in Host space is the central definition
/// of these arrays.
extern Kokkos::Impl::CudaLockArrays g_host_cuda_lock_arrays ;
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// valid, initialized arrays.
///
/// This call is idempotent.
void initialize_host_cuda_lock_arrays();
/// \brief After this call, the g_host_cuda_lock_arrays variable has
/// all null pointers, and all array memory has been freed.
///
/// This call is idempotent.
void finalize_host_cuda_lock_arrays();
} // namespace Impl
} // namespace Kokkos
#if defined( __CUDACC__ )
namespace Kokkos {
namespace Impl {
/// \brief This global variable in CUDA space is what kernels use
/// to get access to the lock arrays.
///
/// When relocatable device code is enabled, there can be one single
/// instance of this global variable for the entire executable,
/// whose definition will be in Kokkos_Cuda_Locks.cpp (and whose declaration
/// here must then be extern).
/// This one instance will be initialized by initialize_host_cuda_lock_arrays
/// and need not be modified afterwards.
///
/// When relocatable device code is disabled, an instance of this variable
/// will be created in every translation unit that sees this header file
/// (we make this clear by marking it static, meaning no other translation
/// unit can link to it).
/// Since the Kokkos_Cuda_Locks.cpp translation unit cannot initialize the
/// instances in other translation units, we must update this CUDA global
/// variable based on the Host global variable prior to running any kernels
/// that will use it.
/// That is the purpose of the KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE macro.
__device__ __constant__
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
Kokkos::Impl::CudaLockArrays g_device_cuda_lock_arrays ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
return (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.atomic[offset],0,1));
}
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
atomicExch( &Kokkos::Impl::g_device_cuda_lock_arrays.atomic[ offset ], 0);
}
} // namespace Impl
} // namespace Kokkos
/* Dan Ibanez: it is critical that this code be a macro, so that it will
capture the right address for Kokkos::Impl::g_device_cuda_lock_arrays!
Putting this in an inline function will NOT do the right thing! */
#define KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE() \
{ \
CUDA_SAFE_CALL(cudaMemcpyToSymbol( \
Kokkos::Impl::g_device_cuda_lock_arrays , \
& Kokkos::Impl::g_host_cuda_lock_arrays , \
sizeof(Kokkos::Impl::CudaLockArrays) ) ); \
}
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE()
#else
#define KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE() KOKKOS_COPY_CUDA_LOCK_ARRAYS_TO_DEVICE()
#endif
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_LOCKS_HPP */
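// Illustrative sketch, not part of this diff: these hashed per-address locks
// are what Kokkos' lock-based device atomics fall back on for types with no
// native CUDA atomic.  A guarded update looks roughly like the following
// (simplified; the real code lives in the Kokkos atomic implementation
// headers, and "example_locked_update" is a hypothetical name):
template< typename T , class Op >
__device__ inline
void example_locked_update( volatile T * const dest , const Op & op , const T val )
{
  bool done = false ;
  while ( ! done ) {
    if ( Kokkos::Impl::lock_address_cuda_space( (void*) dest ) ) {
      *dest = op( *dest , val ) ;
      Kokkos::Impl::unlock_address_cuda_space( (void*) dest ) ;
      done = true ;
    }
  }
}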

View File

@ -58,6 +58,7 @@
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Kokkos_Vectorization.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
@ -65,6 +66,8 @@
#include <typeinfo>
#endif
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -318,6 +321,7 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -363,7 +367,7 @@ public:
const dim3 block( 1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
const dim3 grid( std::min( ( nwork + block.y - 1 ) / block.y , cuda_internal_maximum_grid_count() ) , 1 , 1);
CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
ParallelFor( const FunctorType & arg_functor ,
@ -373,6 +377,115 @@ public:
{ }
};
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::launch_bounds LaunchBounds;
const FunctorType m_functor ;
const Policy m_rp ;
public:
inline
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
inline
void execute() const
{
const array_index_type maxblocks = static_cast<array_index_type>(Kokkos::Impl::CudaTraits::UpperBoundGridCount);
if ( RP::rank == 2 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , 1);
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, 1
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 3 )
{
const dim3 block( m_rp.m_tile[0] , m_rp.m_tile[1] , m_rp.m_tile[2] );
const dim3 grid(
std::min( ( m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1 ) / block.x , maxblocks )
, std::min( ( m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 4 )
{
// id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2] , m_rp.m_tile[3] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1 ) / block.y , maxblocks )
, std::min( ( m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 5 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( ( m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1 ) / block.z , maxblocks )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else if ( RP::rank == 6 )
{
// id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to threadIdx.z
const dim3 block( m_rp.m_tile[0]*m_rp.m_tile[1] , m_rp.m_tile[2]*m_rp.m_tile[3] , m_rp.m_tile[4]*m_rp.m_tile[5] );
const dim3 grid(
std::min( static_cast<index_type>( m_rp.m_tile_end[0] * m_rp.m_tile_end[1] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[2] * m_rp.m_tile_end[3] )
, static_cast<index_type>(maxblocks) )
, std::min( static_cast<index_type>( m_rp.m_tile_end[4] * m_rp.m_tile_end[5] )
, static_cast<index_type>(maxblocks) )
);
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this , grid , block , 0 );
}
else
{
printf("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n");
Kokkos::abort("Aborting");
}
} //end execute
// inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_functor( arg_functor )
, m_rp( arg_policy )
{}
};
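// Illustrative sketch, not part of this diff: a rank-2 use of the
// MDRangePolicy specialization above.  The tile extents {8,8}, the view "A",
// and the helper name are all hypothetical.
inline void example_mdrange_for( Kokkos::View< double ** , Kokkos::CudaSpace > A ,
                                 const int M , const int N )
{
  typedef Kokkos::Experimental::MDRangePolicy<
            Kokkos::Cuda , Kokkos::Experimental::Rank<2> > policy_type ;
  Kokkos::parallel_for( policy_type( { 0 , 0 } , { M , N } , { 8 , 8 } ) ,
    KOKKOS_LAMBDA ( const int i , const int j ) { A( i , j ) = i + j ; } );
}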
template< class FunctorType , class ... Properties >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Properties ... >
@ -384,6 +497,7 @@ private:
typedef TeamPolicyInternal< Kokkos::Cuda , Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
public:
@ -430,15 +544,15 @@ public:
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
@ -448,7 +562,8 @@ public:
}
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >(
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
@ -462,7 +577,7 @@ public:
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
@ -473,7 +588,7 @@ public:
const dim3 grid( int(m_league_size) , 1 , 1 );
const dim3 block( int(m_vector_size) , int(m_team_size) , 1 );
CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
CudaParallelLaunch< ParallelFor, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
}
@ -529,6 +644,7 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
@ -563,6 +679,7 @@ private:
typedef int DummySHMEMReductionType;
public:
// Make the exec_range calls dispatch to Reduce::DeviceIterateTile
template< class TagType >
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
@ -686,7 +803,7 @@ public:
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
@ -737,6 +854,232 @@ public:
{ }
};
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::launch_bounds LaunchBounds;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type value_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
// Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
const FunctorType m_functor ;
const Policy m_policy ; // used for workrange and nwork
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit)
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Some crutch to do function overloading
private:
typedef double DummyShflReductionType;
typedef int DummySHMEMReductionType;
public:
inline
__device__
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline
__device__
void operator() (void) const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
{
reference_type value =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
this-> exec_range( value );
}
// Reduce with final value at blockDim.y - 1 location.
// Problem: non power-of-two blockDim
if ( cuda_single_inter_block_reduce_scan<false,ReducerTypeFwd,WorkTag>(
ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
}
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
}
}
__device__ inline
void run(const DummyShflReductionType&) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Number of blocks is bounded so that the reduction can be limited to two passes.
// Each thread block is given an approximately equal amount of work to perform.
// Accumulate the values for this block.
// The accumulation ordering does not match the final pass, but is arithmetically equivalent.
const Member work_part =
( ( m_policy.m_num_tiles + ( gridDim.x - 1 ) ) / gridDim.x ); //portion of tiles handled by each block
this-> exec_range( value );
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
int max_active_thread = work_part < blockDim.y ? work_part:blockDim.y;
max_active_thread = (max_active_thread == 0)?blockDim.y:max_active_thread;
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<ReducerTypeFwd,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
*result = value;
}
}
}
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
{
unsigned n = CudaTraits::WarpSize * 8 ;
while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( f , n ) ) { n >>= 1 ; }
return n ;
}
inline
void execute()
{
const int nwork = m_policy.m_num_tiles;
if ( nwork ) {
int block_size = m_policy.m_prod_tile_dims;
// CONSTRAINT: Algorithm requires block_size >= product of tile dimensions
// Nearest power of two
int exponent_pow_two = std::ceil( std::log2(block_size) );
block_size = std::pow(2, exponent_pow_two);
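// Illustrative arithmetic, not in the original: a 5x10 tile gives
// m_prod_tile_dims = 50, exponent_pow_two = ceil(log2(50)) = 6, and the
// rounded block_size = 64, which is then compared to suggested_blocksize below.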
int suggested_blocksize = local_block_size( m_functor );
block_size = (block_size > suggested_blocksize) ? block_size : suggested_blocksize ; //Note: block_size must be less than or equal to 512
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_size /* block_size == max block_count */ );
m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) );
// REQUIRED ( 1 , N , 1 )
const dim3 block( 1 , block_size , 1 );
// Required grid.x <= block.y
const dim3 grid( std::min( int(block.y) , int( nwork ) ) , 1 , 1 );
const int shmem = UseShflReduction?0:cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,WorkTag>( m_functor , block.y );
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
else {
if (m_result_ptr) {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , m_result_ptr );
}
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ReducerType & reducer)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().ptr_on_device() )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
{}
};
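// Illustrative sketch, not part of this diff: a rank-2 reduction through the
// MDRangePolicy ParallelReduce specialization above.  The tile extents, the
// view "A", and the helper name are hypothetical.
inline double example_mdrange_sum( Kokkos::View< const double ** , Kokkos::CudaSpace > A ,
                                   const int M , const int N )
{
  typedef Kokkos::Experimental::MDRangePolicy<
            Kokkos::Cuda , Kokkos::Experimental::Rank<2> > policy_type ;
  double sum = 0.0 ;
  Kokkos::parallel_reduce( policy_type( { 0 , 0 } , { M , N } , { 8 , 8 } ) ,
    KOKKOS_LAMBDA ( const int i , const int j , double & lsum ) { lsum += A( i , j ) ; } ,
    sum );
  return sum ;
}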
//----------------------------------------------------------------------------
#if 1
@ -753,6 +1096,7 @@ private:
typedef TeamPolicyInternal< Kokkos::Cuda, Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
@ -819,15 +1163,15 @@ public:
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % Kokkos::Impl::g_device_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
done = (0 == atomicCAS(&Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
if(threadid > Kokkos::Impl::g_device_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
@ -840,7 +1184,7 @@ public:
if ( m_scratch_size[1]>0 ) {
__syncthreads();
if (threadIdx.x==0 && threadIdx.y==0 )
kokkos_impl_cuda_lock_arrays.atomic[threadid]=0;
Kokkos::Impl::g_device_cuda_lock_arrays.scratch[threadid]=0;
}
}
@ -854,7 +1198,8 @@ public:
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
@ -894,7 +1239,8 @@ public:
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Iterate this block through the league
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
const int int_league_size = (int)m_league_size;
for ( int league_rank = blockIdx.x ; league_rank < int_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
@ -936,7 +1282,7 @@ public:
const dim3 grid( block_count , 1 , 1 );
const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
@ -975,12 +1321,6 @@ public:
, m_shmem_begin( 0 )
, m_shmem_size( 0 )
, m_scratch_ptr{NULL,NULL}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_scratch_size{
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
@ -991,6 +1331,12 @@ public:
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
)}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
{
// Return Init value if the number of worksets is zero
if( arg_policy.league_size() == 0) {
@ -1150,6 +1496,7 @@ private:
typedef typename reducer_type<>::pointer_type pointer_type ;
typedef typename reducer_type<>::reference_type reference_type ;
typedef typename reducer_type<>::value_type value_type ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::REDUCE
@ -1273,7 +1620,7 @@ public:
const int shmem = m_shmem_team_begin + m_shmem_team_size ;
// copy to device and execute
CudaParallelLaunch<ParallelReduce>( *this, grid, block, shmem );
CudaParallelLaunch<ParallelReduce,LaunchBounds>( *this, grid, block, shmem );
Cuda::fence();
@ -1373,7 +1720,7 @@ public:
if ( CudaTraits::WarpSize < team_threads ) {
// Need inter-warp team reduction (collectives) shared memory
// Speculate an upper bound for the value size
m_shmem_team_begin =
align_scratch( CudaTraits::warp_count(team_threads) * sizeof(double) );
@ -1426,7 +1773,7 @@ public:
// Reduce space has claim flag followed by value buffer
const int global_reduce_value_size =
max_concurrent_block *
( aligned_flag_size + align_scratch( value_size ) );
// Scratch space has claim flag followed by scratch buffer
@ -1469,6 +1816,7 @@ private:
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::launch_bounds LaunchBounds ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
@ -1655,10 +2003,10 @@ public:
const int shmem = ValueTraits::value_size( m_functor ) * ( block_size + 2 );
m_final = false ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
m_final = true ;
CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
CudaParallelLaunch< ParallelScan, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
}
}

View File

@ -151,7 +151,7 @@ template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_warp_reduction( ValueType& result,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
const uint32_t max_active_thread = blockDim.y) {
unsigned int shift = 1;
@ -268,29 +268,33 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
if( id + 1 < int(gridDim.x) )
join(value, tmp);
}
int active = __ballot(1);
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
join(value, tmp);
}
active += __ballot(1);
}
}
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
#else
@ -302,7 +306,7 @@ template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_warp_reduction( const ReducerType& reducer,
const int max_active_thread = blockDim.y) {
const uint32_t max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
@ -428,26 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer,
if( id + 1 < int(gridDim.x) )
reducer.join(value, tmp);
}
int active = __ballot(1);
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += __ballot(1);
}
}
@ -594,7 +603,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
//typedef typename ValueTraits::reference_type reference_type ;
// '__ffs' = position of the least significant bit set to 1.
// 'blockDim.y' is guaranteed to be a power of two so this
@ -637,7 +646,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
{
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
reference_type shared_value = ValueInit::init( functor , shared_ptr );
/* reference_type shared_value = */ ValueInit::init( functor , shared_ptr );
for ( size_type i = b ; i < e ; ++i ) {
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );

View File

@ -58,25 +58,56 @@ template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
#if defined( KOKKOS_DEBUG )
__device__
void verify_warp_convergence( const char * const where )
{
const unsigned b = __ballot(1);
if ( b != ~0u ) {
printf(" verify_warp_convergence( %s ) (%d,%d,%d) (%d,%d,%d) failed %x\n"
, where
, blockIdx.x
, blockIdx.y
, blockIdx.z
, threadIdx.x
, threadIdx.y
, threadIdx.z
, b );
}
}
#endif // #if defined( KOKKOS_DEBUG )
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue
, int32_t shmem_per_warp )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
extern __shared__ int32_t shmem_all[];
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1 );
Member team_exec( blockDim.y );
int32_t * const warp_shmem =
shmem_all + ( threadIdx.z * shmem_per_warp ) / sizeof(int32_t);
task_root_type * const task_shmem = (task_root_type *) warp_shmem ;
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
Member single_exec( warp_shmem , 1 );
Member team_exec( warp_shmem , blockDim.y );
task_root_type * task_ptr ;
// Loop until all queues are empty and no tasks in flight
@ -87,41 +118,86 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
if ( 0 == warp_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
task_ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
for ( int i = 0 ; i < Queue::NumQueue && end == task_ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task_ptr ; ++j ) {
task_ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
, uintptr_t(task_ptr));
#endif
}
// shuffle broadcast
task.raw[0] = __shfl( task.raw[0] , 0 );
task.raw[1] = __shfl( task.raw[1] , 0 );
((int*) & task_ptr )[0] = __shfl( ((int*) & task_ptr )[0] , 0 );
((int*) & task_ptr )[1] = __shfl( ((int*) & task_ptr )[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
#if defined( KOKKOS_DEBUG )
verify_warp_convergence("task_ptr");
#endif
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
if ( 0 == task_ptr ) break ; // 0 == queue->m_ready_count
if ( end != task_ptr ) {
// Whole warp copy task's closure to/from shared memory.
// Use all threads of warp for coalesced read/write.
int32_t const b = sizeof(task_root_type) / sizeof(int32_t);
int32_t const e = *((int32_t volatile *)( & task_ptr->m_alloc_size )) / sizeof(int32_t);
int32_t volatile * const task_mem = (int32_t volatile *) task_ptr ;
// copy global to shared memory:
for ( int32_t i = warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
warp_shmem[i] = task_mem[i] ;
}
Kokkos::memory_fence();
// Copy done - use memory fence so that memory writes are visible.
// For reliable warp convergence on Pascal and Volta an explicit
// warp level synchronization will also be required.
if ( task_root_type::TaskTeam == task_shmem->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
(*task_shmem->m_apply)( task_shmem , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
(*task_shmem->m_apply)( task_shmem , & single_exec );
}
// copy shared to global memory:
for ( int32_t i = b + warp_lane ; i < e ; i += CudaTraits::WarpSize ) {
task_mem[i] = warp_shmem[i] ;
}
Kokkos::memory_fence();
#if defined( KOKKOS_DEBUG )
verify_warp_convergence("apply");
#endif
// If respawn requested, copy respawn data back to main memory
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
if ( ((task_root_type *) task_root_type::LockTag) != task_shmem->m_next ) {
( (volatile task_root_type *) task_ptr )->m_next = task_shmem->m_next ;
( (volatile task_root_type *) task_ptr )->m_priority = task_shmem->m_priority ;
}
queue->complete( task_ptr );
}
}
} while(1);
@ -130,18 +206,20 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue
, int32_t shmem_size )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue , shmem_size ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int shared_per_warp = 2048 ;
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const int shared_total = shared_per_warp * warps_per_block ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
@ -159,7 +237,7 @@ printf("cuda_task_queue_execute before\n");
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
cuda_task_queue_execute<<< grid , block , shared_total , stream >>>( queue , shared_per_warp );
CUDA_SAFE_CALL( cudaGetLastError() );

View File

@ -57,7 +57,7 @@ namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
( TaskBase<void,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
}
@ -78,7 +78,7 @@ public:
void iff_single_thread_recursive_execute( queue_type * const ) {}
__device__
static void driver( queue_type * const );
static void driver( queue_type * const , int32_t );
static
void execute( queue_type * const );
@ -106,7 +106,14 @@ public:
extern template class TaskQueue< Kokkos::Cuda > ;
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/**\brief Impl::TaskExec<Cuda> is the TaskScheduler<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
@ -134,11 +141,13 @@ private:
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
int32_t * m_team_shmem ;
const int m_team_size ;
__device__
TaskExec( int arg_team_size = blockDim.y )
: m_team_size( arg_team_size ) {}
TaskExec( int32_t * arg_team_shmem , int arg_team_size = blockDim.y )
: m_team_shmem( arg_team_shmem )
, m_team_size( arg_team_size ) {}
public:
@ -154,7 +163,13 @@ public:
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >

View File

@ -106,7 +106,7 @@ private:
typedef Kokkos::Cuda execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
void * m_team_reduce ;
mutable void * m_team_reduce ;
scratch_memory_space m_team_shared ;
int m_team_reduce_size ;
int m_league_rank ;
@ -166,7 +166,7 @@ public:
if ( 1 == blockDim.z ) { // team == block
__syncthreads();
// Wait for shared data write until all threads arrive here
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
*((ValueType*) m_team_reduce) = val ;
}
__syncthreads(); // Wait for shared data read until root thread writes
@ -210,7 +210,7 @@ public:
const int wx =
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
@ -354,7 +354,7 @@ public:
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
if ( (int)threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
@ -410,7 +410,7 @@ public:
value_type tmp( reducer.reference() );
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
@ -479,7 +479,7 @@ public:
__threadfence(); // Wait until global write is visible.
last_block = gridDim.x ==
last_block = (int)gridDim.x ==
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
// If last block then reset count
@ -509,7 +509,7 @@ public:
reducer.copy( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space) + offset );
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
for ( int i = nentry + tid ; i < (int)gridDim.x ; i += nentry ) {
reducer.join( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space)
+ i * reducer.length() );
@ -576,6 +576,14 @@ public:
, m_league_size( arg_league_size )
{}
public:
// Declare to avoid unused private member warnings which are triggered
// when SFINAE excludes the member function which uses these variables
// Making another class a friend also suppresses these warnings
bool impl_avoid_sfinae_warning() const noexcept
{
return m_team_reduce_size > 0 && m_team_reduce != nullptr;
}
};
} // namespace Impl
@ -913,10 +921,10 @@ void parallel_scan
// [t] += [t-4] if t >= 4
// ...
for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
for ( int j = 1 ; j < (int)blockDim.x ; j <<= 1 ) {
value_type tmp = 0 ;
Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
if ( j <= threadIdx.x ) { sval += tmp ; }
if ( j <= (int)threadIdx.x ) { sval += tmp ; }
}
// Include accumulation and remove value for exclusive scan:

View File

@ -0,0 +1,133 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_UNIQUE_TOKEN_HPP
#define KOKKOS_CUDA_UNIQUE_TOKEN_HPP
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
#include <impl/Kokkos_ConcurrentBitset.hpp>
namespace Kokkos { namespace Experimental {
// both global and instance Unique Tokens are implemented in the same way
template<>
class UniqueToken< Cuda, UniqueTokenScope::Global >
{
private:
uint32_t volatile * m_buffer ;
uint32_t m_count ;
public:
using execution_space = Cuda;
explicit
UniqueToken( execution_space const& );
KOKKOS_INLINE_FUNCTION
UniqueToken() : m_buffer(0), m_count(0) {}
KOKKOS_INLINE_FUNCTION
UniqueToken( const UniqueToken & ) = default;
KOKKOS_INLINE_FUNCTION
UniqueToken( UniqueToken && ) = default;
KOKKOS_INLINE_FUNCTION
UniqueToken & operator=( const UniqueToken & ) = default ;
KOKKOS_INLINE_FUNCTION
UniqueToken & operator=( UniqueToken && ) = default ;
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int32_t size() const noexcept { return m_count ; }
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int32_t acquire() const
{
const Kokkos::pair<int,int> result =
Kokkos::Impl::concurrent_bitset::
acquire_bounded( m_buffer
, m_count
, Kokkos::Impl::clock_tic() % m_count
);
if ( result.first < 0 ) {
Kokkos::abort("UniqueToken<Cuda> failure to release tokens, no tokens available" );
}
return result.first;
}
/// \brief release an acquired value
KOKKOS_INLINE_FUNCTION
void release( int32_t i ) const noexcept
{
Kokkos::Impl::concurrent_bitset::release( m_buffer, i );
}
};
template<>
class UniqueToken< Cuda, UniqueTokenScope::Instance >
: public UniqueToken< Cuda, UniqueTokenScope::Global >
{
public:
explicit
UniqueToken( execution_space const& arg )
: UniqueToken< Cuda, UniqueTokenScope::Global >( arg ) {}
};
}} // namespace Kokkos::Experimental
#endif // KOKKOS_ENABLE_CUDA
#endif // KOKKOS_CUDA_UNIQUE_TOKEN_HPP
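For illustration only (not part of the patch), a minimal usage sketch of the interface above: each iteration acquires an id in [0, size()) to index per-worker scratch storage and releases it afterwards. The view name, the extent N, the explicitly constructed Cuda instance, and enabled CUDA lambda support are assumptions of the sketch.
Kokkos::Experimental::UniqueToken< Kokkos::Cuda > token( Kokkos::Cuda() );
Kokkos::View< double * , Kokkos::CudaSpace > scratch( "scratch" , token.size() );
Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda >( 0 , N ) ,
  KOKKOS_LAMBDA( const int ) {
    const int32_t id = token.acquire();   // 0 <= id < token.size()
    scratch( id ) += 1.0 ;                // exclusive use of slot 'id'
    token.release( id );                  // return the slot for reuse
  } );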

View File

@ -221,7 +221,6 @@ struct CudaLDGFetch {
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
@ -294,9 +293,8 @@ public:
}
};
}
}
}
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
#define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Cuda
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Cuda,
Traits ...
>
{
public:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;
private:
template< class TagType >
__device__
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
__device__
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
public:
__device__
inline
void operator()() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
inline
void execute()
{
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const cudaStream_t stream = 0 ;
Kokkos::Impl::CudaParallelLaunch<Self>(*this, grid, block, shared, stream);
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP */

View File

@ -52,6 +52,7 @@
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif
namespace Kokkos { namespace Experimental {
@ -120,28 +121,17 @@ struct MDRangePolicy
, typename traits::index_type
> ;
typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename traits::iteration_pattern;
using work_tag = typename traits::work_tag;
using launch_bounds = typename traits::launch_bounds;
using member_type = typename range_policy::member_type;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) ;
// Ugly ugly workaround intel 14 not handling scoped enum correctly
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
enum { rank = static_cast<int>(iteration_pattern::rank) };
using index_type = typename traits::index_type;
using array_index_type = long;
@ -155,11 +145,50 @@ struct MDRangePolicy
// This would require the user to either pass a matching index_type parameter
// as template parameter to the MDRangePolicy or static_cast the individual values
point_type m_lower;
point_type m_upper;
tile_type m_tile;
point_type m_tile_end;
index_type m_num_tiles;
index_type m_prod_tile_dims;
/*
// NDE enum impl definition alternative - replace static constexpr int ?
enum { outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value ) };
enum { inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) };
enum { Right = static_cast<int>( Iterate::Right ) };
enum { Left = static_cast<int>( Iterate::Left ) };
*/
//static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) ;
// Ugly ugly workaround intel 14 not handling scoped enum correctly
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
, m_prod_tile_dims(1)
{
// Host
if ( true
@ -172,8 +201,8 @@ struct MDRangePolicy
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
}
@ -183,6 +212,7 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
@ -190,14 +220,18 @@ struct MDRangePolicy
{
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
if ( m_prod_tile_dims < 512 ) {
m_tile[i] = 2;
} else {
m_tile[i] = 1;
}
}
else {
m_tile[i] = 16;
@ -205,12 +239,9 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
@ -223,19 +254,7 @@ struct MDRangePolicy
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
{
#if 0
// This should work, less duplicated code but not yet extensively tested
point_type lower_tmp, upper_tmp;
tile_type tile_tmp;
for ( auto i = 0; i < rank; ++i ) {
lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]);
}
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
#else
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
@ -249,7 +268,7 @@ struct MDRangePolicy
}
m_num_tiles = 1;
m_prod_tile_dims = 1;
// Host
if ( true
@ -262,8 +281,8 @@ struct MDRangePolicy
for (int i=0; i<rank; ++i) {
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
}
@ -273,6 +292,7 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
@ -284,10 +304,14 @@ struct MDRangePolicy
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
if ( ((int)inner_direction == (int)Right && (i < rank-1))
|| ((int)inner_direction == (int)Left && (i > 0)) )
{
m_tile[i] = 2;
if ( m_prod_tile_dims < 512 ) {
m_tile[i] = 2;
} else {
m_tile[i] = 1;
}
}
else {
m_tile[i] = 16;
@ -295,32 +319,22 @@ struct MDRangePolicy
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
m_prod_tile_dims *= m_tile[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
if ( m_prod_tile_dims > 512 ) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 max per dim (Kepler), but product num_threads < 1024
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
}
}
#endif
#endif
}
point_type m_lower;
point_type m_upper;
tile_type m_tile;
point_type m_tile_end;
index_type m_num_tiles;
};
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_for
//md_parallel_for - deprecated use parallel_for
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
@ -335,7 +349,6 @@ void md_parallel_for( MDRange const& range
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@ -354,7 +367,6 @@ void md_parallel_for( const std::string& str
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
@ -395,7 +407,7 @@ void md_parallel_for( MDRange const& range
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_reduce
//md_parallel_reduce - deprecated use parallel_reduce
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
@ -409,9 +421,8 @@ void md_parallel_reduce( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
@ -428,48 +439,14 @@ void md_parallel_reduce( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
// Cuda - parallel_reduce not implemented yet
/*
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
, Functor const& f
, ValueType & v
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( const std::string& str
, MDRange const& range
, Functor const& f
, ValueType & v
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
*/
// Cuda - md_parallel_reduce not implemented - use parallel_reduce
}} // namespace Kokkos::Experimental
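A hedged sketch of the non-deprecated path referred to above: construct an MDRangePolicy from lower/upper bounds and optional tile sizes, then pass it directly to parallel_for. The rank-2 views A, B, C and the extents N, M are assumptions, and the Rank alias is taken to live in Kokkos::Experimental as in this release.
using policy2_t = Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >;
policy2_t policy( {0,0} , {N,M} , {16,16} );   // lower , upper , tile
Kokkos::parallel_for( policy , KOKKOS_LAMBDA( const int i , const int j ) {
  A(i,j) = B(i,j) + C(i,j);   // tile sizes bound the thread block on Cuda
} );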

View File

@ -114,40 +114,9 @@
#endif /* Not pre-selected atomic implementation */
#endif
//----------------------------------------------------------------------------
// Forward declaration of functions supporting arbitrary sized atomics
// This is necessary since Kokkos_Atomic.hpp is internally included very early
// through Kokkos_HostSpace.hpp as well as the allocation tracker.
#ifdef KOKKOS_ENABLE_CUDA
namespace Kokkos {
namespace Impl {
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#endif
__device__ inline
bool lock_address_cuda_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
extern
#endif
__device__ inline
void unlock_address_cuda_space(void* ptr);
}
}
#endif
namespace Kokkos {
template <typename T>

View File

@ -79,6 +79,21 @@ struct IndexType
using type = T;
};
/**\brief Specify Launch Bounds for CUDA execution.
*
* The "best" defaults may be architecture specific.
*/
template< unsigned int maxT = 1024 /* Max threads per block */
, unsigned int minB = 1 /* Min blocks per SM */
>
struct LaunchBounds
{
using launch_bounds = LaunchBounds;
using type = LaunchBounds<maxT,minB>;
static unsigned int constexpr maxTperB {maxT};
static unsigned int constexpr minBperSM {minB};
};
} // namespace Kokkos
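A brief, illustrative sketch of attaching the new property to an execution policy (the extent N and the kernel body are assumptions): the two parameters are forwarded to the CUDA launch as __launch_bounds__(maxTperB, minBperSM).
// Request at most 256 threads per block and at least 4 resident blocks per SM.
using bounded_policy = Kokkos::RangePolicy< Kokkos::Cuda , Kokkos::LaunchBounds<256,4> >;
Kokkos::parallel_for( bounded_policy( 0 , N ) , KOKKOS_LAMBDA( const int i ) {
  /* kernel body */
} );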
//----------------------------------------------------------------------------
@ -119,6 +134,7 @@ using Kokkos::is_array_layout ;
KOKKOS_IMPL_IS_CONCEPT( iteration_pattern )
KOKKOS_IMPL_IS_CONCEPT( schedule_type )
KOKKOS_IMPL_IS_CONCEPT( index_type )
KOKKOS_IMPL_IS_CONCEPT( launch_bounds )
}

View File

@ -96,11 +96,13 @@ struct InitArguments {
int num_numa;
int device_id;
InitArguments() {
num_threads = -1;
num_numa = -1;
device_id = -1;
}
InitArguments( int nt = -1
, int nn = -1
, int dv = -1)
: num_threads( nt )
, num_numa( nn )
, device_id( dv )
{}
};
void initialize(int& narg, char* arg[]);
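For context, a small sketch of how the defaulted fields above are typically consumed, assuming the InitArguments overload of Kokkos::initialize that accompanies this struct; the thread count and device id below are arbitrary.
Kokkos::InitArguments args( 8 /*num_threads*/ , -1 /*num_numa*/ , 0 /*device_id*/ );
Kokkos::initialize( args );
/* ... parallel kernels ... */
Kokkos::finalize();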
@ -168,6 +170,9 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
} // namespace Kokkos
#include <Kokkos_Crs.hpp>
#include <Kokkos_WorkGraphPolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -51,6 +51,9 @@
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Utilities.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <Kokkos_MasterLock.hpp>
//----------------------------------------------------------------------------
// Have assumed a 64bit build (8byte pointers) throughout the code base.

View File

@ -0,0 +1,333 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CRS_HPP
#define KOKKOS_CRS_HPP
namespace Kokkos {
namespace Experimental {
/// \class Crs
/// \brief Compressed row storage array.
///
/// \tparam DataType The type of stored entries. If a Crs is
/// used as the graph of a sparse matrix, then this is usually an
/// integer type, the type of the column indices in the sparse
/// matrix.
///
/// \tparam Arg1Type The second template parameter, corresponding
/// either to the Device type (if there are no more template
/// parameters) or to the Layout type (if there is at least one more
/// template parameter).
///
/// \tparam Arg2Type The third template parameter, which if provided
/// corresponds to the Device type.
///
/// \tparam SizeType The type of row offsets. Usually the default
/// parameter suffices. However, setting a nondefault value is
/// necessary in some cases, for example, if you want to have a
/// sparse matrix with dimensions (and therefore column indices)
/// that fit in \c int, but want to store more than <tt>INT_MAX</tt>
/// entries in the sparse matrix.
///
/// A row has a range of entries:
/// <ul>
/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
/// <li> <tt> entries( entry , i2 , i3 , ... ); </tt> </li>
/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
/// </ul>
template< class DataType,
class Arg1Type,
class Arg2Type = void,
typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
class Crs {
protected:
typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
public:
typedef DataType data_type;
typedef typename traits::array_layout array_layout;
typedef typename traits::execution_space execution_space;
typedef typename traits::memory_space memory_space;
typedef typename traits::device_type device_type;
typedef SizeType size_type;
typedef Crs< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
typedef Crs< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
typedef View<size_type* , array_layout, device_type> row_map_type;
typedef View<DataType* , array_layout, device_type> entries_type;
entries_type entries;
row_map_type row_map;
//! Construct an empty view.
Crs () : entries(), row_map() {}
//! Copy constructor (shallow copy).
Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
{}
template<class EntriesType, class RowMapType>
Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
{}
/** \brief Assign to a view of the rhs array.
* If the old view is the last view
* then allocated memory is deallocated.
*/
Crs& operator= (const Crs& rhs) {
entries = rhs.entries;
row_map = rhs.row_map;
return *this;
}
/** \brief Destroy this view of the array.
* If the last view then allocated memory is deallocated.
*/
~Crs() {}
/** \brief Return number of rows in the graph
*/
KOKKOS_INLINE_FUNCTION
size_type numRows() const {
return (row_map.dimension_0 () != 0) ?
row_map.dimension_0 () - static_cast<size_type> (1) :
static_cast<size_type> (0);
}
};
/*--------------------------------------------------------------------------*/
template< class OutCounts,
class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void get_crs_transpose_counts(
OutCounts& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
std::string const& name = "transpose_counts");
template< class OutCounts,
class InCrs>
void get_crs_row_map_from_counts(
OutCounts& out,
InCrs const& in,
std::string const& name = "row_map");
template< class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
}} // namespace Kokkos::Experimental
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
public:
using execution_space = typename InCrs::execution_space;
using self_type = GetCrsTransposeCounts<InCrs, OutCounts>;
using index_type = typename InCrs::size_type;
private:
InCrs in;
OutCounts out;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i) const {
atomic_increment( &out[in.entries(i)] );
}
GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out):
in(arg_in),out(arg_out) {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, index_type(in.entries.size())));
closure.execute();
execution_space::fence();
}
};
template <class InCounts, class OutRowMap>
class CrsRowMapFromCounts {
public:
using execution_space = typename InCounts::execution_space;
using value_type = typename OutRowMap::value_type;
using index_type = typename InCounts::size_type;
private:
InCounts in;
OutRowMap out;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i, value_type& update, bool final_pass) const {
update += in(i);
if (final_pass) {
out(i + 1) = update;
if (i == 0) {
out(0) = 0;
}
}
}
KOKKOS_INLINE_FUNCTION
void init(value_type& update) const { update = 0; }
KOKKOS_INLINE_FUNCTION
void join(volatile value_type& update, const volatile value_type& input) const {
update += input;
}
using self_type = CrsRowMapFromCounts<InCounts, OutRowMap>;
CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
in(arg_in),out(arg_out) {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelScan<self_type, policy_type>;
closure_type closure(*this, policy_type(0, in.size()));
closure.execute();
execution_space::fence();
}
};
template <class InCrs, class OutCrs>
class FillCrsTransposeEntries {
public:
using execution_space = typename InCrs::execution_space;
using memory_space = typename InCrs::memory_space;
using value_type = typename OutCrs::entries_type::value_type;
using index_type = typename InCrs::size_type;
private:
using counters_type = View<index_type*, memory_space>;
InCrs in;
OutCrs out;
counters_type counters;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i) const {
auto begin = in.row_map(i);
auto end = in.row_map(i + 1);
for (auto j = begin; j < end; ++j) {
auto ti = in.entries(j);
auto tbegin = out.row_map(ti);
auto tj = atomic_fetch_add( &counters(ti), 1 );
out.entries( tbegin + tj ) = i;
}
}
using self_type = FillCrsTransposeEntries<InCrs, OutCrs>;
FillCrsTransposeEntries(InCrs const& arg_in, OutCrs const& arg_out):
in(arg_in),out(arg_out),
counters("counters", arg_out.numRows()) {
using policy_type = RangePolicy<index_type, execution_space>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, index_type(in.numRows())));
closure.execute();
execution_space::fence();
}
};
}}} // namespace Kokkos::Impl::Experimental
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
template< class OutCounts,
class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void get_crs_transpose_counts(
OutCounts& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in,
std::string const& name) {
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
out = OutCounts(name, in.numRows());
Kokkos::Impl::Experimental::
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
}
template< class OutRowMap,
class InCounts>
void get_crs_row_map_from_counts(
OutRowMap& out,
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
}
template< class DataType,
class Arg1Type,
class Arg2Type,
class SizeType>
void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in)
{
typedef Crs<DataType, Arg1Type, Arg2Type, SizeType> crs_type ;
typedef typename crs_type::memory_space memory_space ;
typedef View<SizeType*, memory_space> counts_type ;
{
counts_type counts;
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
"tranpose_row_map");
}
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
Kokkos::Impl::Experimental::
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
}} // namespace Kokkos::Experimental
#endif /* #define KOKKOS_CRS_HPP */
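A minimal sketch of the public interface declared above (graph construction is elided; the entry type and use of the default execution space are assumptions):
using graph_t = Kokkos::Experimental::Crs< int , Kokkos::DefaultExecutionSpace >;
graph_t graph ;       /* ... fill graph.row_map and graph.entries ... */
graph_t transposed ;
Kokkos::Experimental::transpose_crs( transposed , graph );  // fills row_map and entries of 'transposed'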

View File

@ -217,8 +217,8 @@ public:
private:
cudaStream_t m_stream ;
int m_device ;
cudaStream_t m_stream ;
};
} // namespace Kokkos
@ -295,6 +295,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_Cuda_Team.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>
#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------

View File

@ -90,7 +90,7 @@ public:
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access CudaSpace */
@ -186,7 +186,7 @@ public:
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
/*--------------------------------*/
@ -234,7 +234,7 @@ public:
, const size_t arg_alloc_size ) const ;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
private:

View File

@ -384,6 +384,7 @@ Impl::PerThreadValue PerThread(const int& arg);
* WorkTag (none): Tag which is used as the first argument for the functor operator.
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
* IndexType<Type> (IndexType<ExecutionSpace::size_type>): Integer Index type used to iterate over the Index space.
* LaunchBounds<int,int> (LaunchBounds<1024,1>): Launch Bounds for CUDA compilation.
*/
template< class ... Properties>
class TeamPolicy: public
@ -561,6 +562,45 @@ KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType>
ThreadVectorRange( const TeamMemberType&, const iType& count );
#if defined(KOKKOS_ENABLE_PROFILING)
namespace Impl {
template<typename FunctorType, typename TagType,
bool HasTag = !std::is_same<TagType, void>::value >
struct ParallelConstructName;
template<typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, true> {
ParallelConstructName(std::string const& label):label_ref(label) {
if (label.empty()) {
default_name = std::string(typeid(FunctorType).name()) + "/" +
typeid(TagType).name();
}
}
std::string const& get() {
return (label_ref.empty()) ? default_name : label_ref;
}
std::string const& label_ref;
std::string default_name;
};
template<typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, false> {
ParallelConstructName(std::string const& label):label_ref(label) {
if (label.empty()) {
default_name = std::string(typeid(FunctorType).name());
}
}
std::string const& get() {
return (label_ref.empty()) ? default_name : label_ref;
}
std::string const& label_ref;
std::string default_name;
};
} // namespace Impl
#endif /* defined KOKKOS_ENABLE_PROFILING */
} // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */
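To illustrate the property list documented above (league_size and the team body are assumptions), a TeamPolicy can combine Schedule, IndexType, and the new LaunchBounds property:
using team_policy = Kokkos::TeamPolicy< Kokkos::Cuda
                                      , Kokkos::Schedule< Kokkos::Static >
                                      , Kokkos::IndexType< int >
                                      , Kokkos::LaunchBounds< 256 , 4 > >;
Kokkos::parallel_for( team_policy( league_size , Kokkos::AUTO ) ,
  KOKKOS_LAMBDA( const team_policy::member_type & team ) {
    /* team-collective work */
  } );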

View File

@ -126,14 +126,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HBWSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
/*--------------------------------*/
/**\brief Default memory space instance */
HBWSpace();
HBWSpace( const HBWSpace & rhs ) = default;

View File

@ -130,14 +130,6 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
/*--------------------------------*/
/**\brief Default memory space instance */
HostSpace();
HostSpace( HostSpace && rhs ) = default;
@ -161,7 +153,7 @@ public:
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
static constexpr const char* name() { return m_name; }
private:
AllocationMechanism m_alloc_mech;

View File

@ -156,6 +156,8 @@ struct LayoutStride {
for ( int r = 0 ; r < ARRAY_LAYOUT_MAX_RANK ; ++r ) {
tmp.dimension[r] = 0 ;
tmp.stride[r] = 0 ;
}
for ( int r = 0 ; r < rank ; ++r ) {
check_input &= ~int( 1 << order[r] );
}
if ( 0 == check_input ) {

View File

@ -297,6 +297,10 @@
#endif
#endif
#if defined( KOKKOS_ARCH_AVX512MIC )
#define KOKKOS_ENABLE_RFO_PREFETCH 1
#endif
#if defined( __MIC__ )
// Compiling for Xeon Phi
#endif
@ -344,13 +348,18 @@
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if defined( KOKKOS_ARCH_AVX512MIC )
#define KOKKOS_ENABLE_RFO_PREFETCH 1
#endif
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
( defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ ) )
defined( __x86_64 ) || defined( __x86_64__ ) || \
defined(__PPC64__) )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif

View File

@ -0,0 +1,73 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MASTER_LOCK_HPP
#define KOKKOS_MASTER_LOCK_HPP
#include <Kokkos_Macros.hpp>
namespace Kokkos { namespace Experimental {
// may be used to coordinate work between master instances
// SHOULD NOT be used within a parallel algorithm
//
// This lock should be used with a scoped lock guard
// i.e. std::unique_lock<Lock>, std::lock_guard
//
// cannot be copied or moved
// has the following functions available
//
// Lock()
// ~Lock()
//
// void lock()
// void unlock()
// bool try_lock()
//
template <typename ExecutionSpace>
class MasterLock;
}} // namespace Kokkos::Experimental
#endif //KOKKOS_MASTER_LOCK_HPP
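A hedged usage sketch consistent with the comments above, assuming a backend provides the MasterLock specialization (e.g. OpenMP) and that <mutex> is included for std::lock_guard:
Kokkos::Experimental::MasterLock< Kokkos::OpenMP > lock ;
{
  std::lock_guard< Kokkos::Experimental::MasterLock< Kokkos::OpenMP > > guard( lock );
  /* work coordinated between master instances; never call this inside a parallel algorithm */
}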

View File

@ -66,11 +66,6 @@ private:
enum : uint32_t { max_bit_count_lg2 = CB::max_bit_count_lg2 };
enum : uint32_t { max_bit_count = CB::max_bit_count };
/* Defaults for min block, max block, and superblock sizes */
enum : uint32_t { MIN_BLOCK_SIZE_LG2 = 6 /* 64 bytes */ };
enum : uint32_t { MAX_BLOCK_SIZE_LG2 = 12 /* 4k bytes */ };
enum : uint32_t { SUPERBLOCK_SIZE_LG2 = 16 /* 64k bytes */ };
enum : uint32_t { HINT_PER_BLOCK_SIZE = 2 };
/* Each superblock has a concurrent bitset state
@ -85,6 +80,14 @@ private:
* is concurrently updated.
*/
/* Mapping between block_size <-> block_state
*
* block_state = ( m_sb_size_lg2 - block_size_lg2 ) << state_shift
* block_size = m_sb_size_lg2 - ( block_state >> state_shift )
*
* Thus A_block_size < B_block_size <=> A_block_state > B_block_state
*/
typedef typename DeviceType::memory_space base_memory_space ;
enum { accessible =
@ -251,10 +254,10 @@ public:
* significant runtime performance improvements.
*/
MemoryPool( const base_memory_space & memspace
, const size_t min_total_alloc_size
, const uint32_t min_block_alloc_size // = 1 << MIN_BLOCK_SIZE_LG2
, const uint32_t max_block_alloc_size // = 1 << MAX_BLOCK_SIZE_LG2
, const uint32_t min_superblock_size // = 1 << SUPERBLOCK_SIZE_LG2
, const size_t min_total_alloc_size
, size_t min_block_alloc_size = 0
, size_t max_block_alloc_size = 0
, size_t min_superblock_size = 0
)
: m_tracker()
, m_sb_state_array(0)
@ -267,8 +270,43 @@ public:
, m_data_offset(0)
, m_unused_padding(0)
{
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
// Constraints and defaults:
// min_block_alloc_size <= max_block_alloc_size
// max_block_alloc_size <= min_superblock_size
// min_superblock_size <= min_total_alloc_size
const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
if ( 0 == max_block_alloc_size ) {
max_block_alloc_size = MAX_BLOCK_SIZE ;
// Upper bound of total allocation size
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
, min_total_alloc_size );
// Lower bound of minimum block size
max_block_alloc_size = std::max( max_block_alloc_size
, min_block_alloc_size );
}
if ( 0 == min_superblock_size ) {
min_superblock_size = max_block_alloc_size ;
// Upper bound of total allocation size
min_superblock_size = std::min( size_t(min_superblock_size)
, min_total_alloc_size );
// Lower bound of maximum block size
min_superblock_size = std::max( min_superblock_size
, max_block_alloc_size );
}
// Block and superblock size is power of two:
@ -435,6 +473,8 @@ public:
void * allocate( size_t alloc_size
, int32_t attempt_limit = 1 ) const noexcept
{
if ( 0 == alloc_size ) return (void*) 0 ;
void * p = 0 ;
const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
@ -444,10 +484,9 @@ public:
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_count_mask = block_count - 1 ;
const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ;
const uint32_t block_state = block_count_lg2 << state_shift ;
const uint32_t block_count = 1u << block_count_lg2 ;
// Superblock hints for this block size:
// hint_sb_id_ptr[0] is the dynamically changing hint
@ -465,7 +504,7 @@ public:
// the guess for which block within a superblock should
// be claimed. If not available then a search occurs.
const uint32_t block_id_hint = block_count_mask &
const uint32_t block_id_hint =
(uint32_t)( Kokkos::Impl::clock_tic()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
// Spread out potentially concurrent access
@ -474,6 +513,9 @@ public:
#endif
);
// expected state of superblock for allocation
uint32_t sb_state = block_state ;
int32_t sb_id = -1 ;
volatile uint32_t * sb_state_array = 0 ;
@ -484,6 +526,8 @@ public:
if ( sb_id < 0 ) {
// No superblock specified, try the hint for this block size
sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr );
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
@ -493,16 +537,20 @@ public:
// 0 <= sb_id
// sb_state_array == m_sb_state_array + m_sb_state_size * sb_id
if ( block_state == ( state_header_mask & *sb_state_array ) ) {
if ( sb_state == ( state_header_mask & *sb_state_array ) ) {
// This superblock state is assigned to this block size.
// Try to claim a bit.
// This superblock state is as expected, for the moment.
// Attempt to claim a bit. The attempt updates the state
// so have already made sure the state header is as expected.
const uint32_t count_lg2 = sb_state >> state_shift ;
const uint32_t mask = ( 1u << count_lg2 ) - 1 ;
const Kokkos::pair<int,int> result =
CB::acquire_bounded_lg2( sb_state_array
, block_count_lg2
, block_id_hint
, block_state
, count_lg2
, block_id_hint & mask
, sb_state
);
// If result.first < 0 then failed to acquire
@ -512,16 +560,18 @@ public:
if ( 0 <= result.first ) { // acquired a bit
const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ;
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( result.first << block_size_lg2 ); // block memory
+ ( result.first << size_lg2 ); // block memory
break ; // Success
}
// printf(" acquire block_count_lg2(%d) block_state(0x%x) sb_id(%d) result(%d,%d)\n" , block_count_lg2 , block_state , sb_id , result.first , result.second );
// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
}
//------------------------------------------------------------------
@ -529,12 +579,18 @@ public:
// Must find a new superblock.
// Start searching at designated index for this block size.
// Look for a partially full superblock of this block size.
// Look for an empty superblock just in case cannot find partfull.
// Look for superblock that, in preferential order,
// 1) part-full superblock of this block size
// 2) empty superblock to claim for this block size
// 3) part-full superblock of the next larger block size
sb_state = block_state ; // Expect to find the desired state
sb_id = -1 ;
bool update_hint = false ;
int32_t sb_id_empty = -1 ;
int32_t sb_id_large = -1 ;
uint32_t sb_state_large = 0 ;
sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ;
@ -544,38 +600,54 @@ public:
// Note that the state may change at any moment
// as concurrent allocations and deallocations occur.
const uint32_t state = *sb_state_array ;
const uint32_t used = state & state_used_mask ;
const uint32_t full_state = *sb_state_array ;
const uint32_t used = full_state & state_used_mask ;
const uint32_t state = full_state & state_header_mask ;
if ( block_state == ( state & state_header_mask ) ) {
if ( state == block_state ) {
// Superblock is assigned to this block size
if ( used < block_count ) {
if ( used < block_count ) {
// There is room to allocate one block
sb_id = id ;
if ( used + 1 < block_count ) {
// Is there room to allocate more than one block?
// There is room to allocate more than one block
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
update_hint = used + 1 < block_count ;
break ;
}
}
else if ( ( used == 0 ) && ( sb_id_empty == -1 ) ) {
else if ( 0 == used ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
// Superblock is empty
sb_id_empty = id ;
if ( -1 == sb_id_empty ) {
// Superblock is not assigned to this block size
// and is the first empty superblock encountered.
// Save this id to use if a partfull superblock is not found.
sb_id_empty = id ;
}
}
else if ( ( -1 == sb_id_empty /* have not found an empty */ ) &&
( -1 == sb_id_large /* have not found a larger */ ) &&
( state < block_state /* a larger block */ ) &&
// is not full:
( used < ( 1u << ( state >> state_shift ) ) ) ) {
// First superblock encountered that is
// larger than this block size and
// has room for an allocation.
// Save this id to use if a part-full or empty superblock is not found
sb_id_large = id ;
sb_state_large = state ;
}
// Iterate around the superblock array:
if ( ++id < m_sb_count ) {
sb_state_array += m_sb_state_size ;
@ -586,7 +658,7 @@ public:
}
}
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d)\n" , m_sb_count , sb_id , sb_id_empty );
// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large);
if ( sb_id < 0 ) {
@ -609,21 +681,31 @@ public:
const uint32_t state_empty = state_header_mask & *sb_state_array ;
if ( state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state) ) {
// If this thread claims the empty block then update the hint
update_hint =
state_empty ==
Kokkos::atomic_compare_exchange
(sb_state_array,state_empty,block_state);
}
else if ( 0 <= sb_id_large ) {
// If this thread claimed the block then update the hint
// Found a larger superblock with space available
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
sb_id = sb_id_large ;
sb_state = sb_state_large ;
sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size );
}
else {
// Did not find a potentially usable superblock
--attempt_limit ;
}
}
if ( update_hint ) {
Kokkos::atomic_compare_exchange
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
@ -646,6 +728,8 @@ public:
KOKKOS_INLINE_FUNCTION
void deallocate( void * p , size_t /* alloc_size */ ) const noexcept
{
if ( 0 == p ) return ;
// Determine which superblock and block
const ptrdiff_t d =
((char*)p) - ((char*)( m_sb_state_array + m_data_offset ));
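// Sketch of how the superblock and block indices can be recovered from the
// byte offset d (an assumption about the surrounding implementation, shown
// here only for clarity; the state-word names mirror the allocate path above):
//
//   const int      sb_id    = d >> m_sb_size_lg2 ;                               // which superblock
//   const uint32_t sb_state = m_sb_state_array[ sb_id * m_sb_state_size ] ;
//   const uint32_t size_lg2 = m_sb_size_lg2 - ( sb_state >> state_shift ) ;      // block size of that superblock
//   const int      bit      = ( d & ( ( ptrdiff_t(1) << m_sb_size_lg2 ) - 1 ) ) >> size_lg2 ;  // which block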

View File

@ -72,11 +72,11 @@ struct MemoryTraits {
//! Tag this class as a kokkos memory traits:
typedef MemoryTraits memory_traits ;
enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) };
enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
enum { Atomic = T & unsigned(Kokkos::Atomic) };
enum { Restrict = T & unsigned(Kokkos::Restrict) };
enum { Aligned = T & unsigned(Kokkos::Aligned) };
enum : bool { Unmanaged = (unsigned(0) != (T & unsigned(Kokkos::Unmanaged))) };
enum : bool { RandomAccess = (unsigned(0) != (T & unsigned(Kokkos::RandomAccess))) };
enum : bool { Atomic = (unsigned(0) != (T & unsigned(Kokkos::Atomic))) };
enum : bool { Restrict = (unsigned(0) != (T & unsigned(Kokkos::Restrict))) };
enum : bool { Aligned = (unsigned(0) != (T & unsigned(Kokkos::Aligned))) };
};
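// Illustrative usage sketch (the view is hypothetical): these boolean enums are
// typically supplied to, and queried from, a View's memory traits, e.g.
//
//   Kokkos::View< const double * , Kokkos::MemoryTraits< Kokkos::RandomAccess > > x ;
//   static_assert( decltype(x)::traits::memory_traits::RandomAccess , "random-access trait set" );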
@ -109,7 +109,11 @@ enum { MEMORY_ALIGNMENT =
#else
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
, MEMORY_ALIGNMENT_THRESHOLD = 4
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
#else
, MEMORY_ALIGNMENT_THRESHOLD = 4
#endif
};
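// Illustrative configuration sketch (the value 8 is hypothetical): the threshold
// can now be overridden at configure/compile time, e.g.
//
//   #define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 8
//
// before the Kokkos headers are processed; otherwise the default of 4 is used.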

View File

@ -47,10 +47,6 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP)
#if !defined(_OPENMP)
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
#endif
#include <Kokkos_Core_fwd.hpp>
#include <cstddef>
@ -67,95 +63,144 @@
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <vector>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class OpenMPExec;
}
/// \class OpenMP
/// \brief Kokkos device for multicore processors in the host memory space.
class OpenMP {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
using execution_space = OpenMP;
using memory_space =
#ifdef KOKKOS_ENABLE_HBWSPACE
using memory_space = Experimental::HBWSpace;
Experimental::HBWSpace;
#else
using memory_space = HostSpace;
HostSpace;
#endif
//! This execution space preferred device_type
using device_type = Kokkos::Device<execution_space,memory_space>;
using array_layout = LayoutRight;
using size_type = memory_space::size_type;
using device_type = Kokkos::Device< execution_space, memory_space >;
using array_layout = LayoutRight;
using size_type = memory_space::size_type;
using scratch_memory_space = ScratchMemorySpace< OpenMP >;
//@}
//------------------------------------
//! \name Functions that all Kokkos execution spaces must implement.
//@{
/// \brief Get a handle to the default execution space instance
inline
OpenMP() noexcept;
inline static bool in_parallel();
// Using omp_get_max_threads() is problematic.
// On Intel, an initial call to the OpenMP runtime without a preceding parallel
// region sets a process mask for a single core. On entering the first parallel
// region the runtime then binds threads to other cores and makes the process
// mask the aggregate of the thread masks. The intent seems to be to make serial
// code run fast if you compile with OpenMP enabled but never actually use
// parallel regions.
// static int omp_max_threads = omp_get_max_threads();
static int get_current_max_threads() noexcept;
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
static bool sleep();
/// \brief Initialize the default execution space
///
/// if ( thread_count == -1 )
///   then use the number of threads that OpenMP defaults to
/// if ( thread_count == 0 && Kokkos::hwloc::available() )
///   then use hwloc to choose the number of threads and change
///   the default number of threads
/// if ( thread_count > 0 )
///   then force OpenMP to use the given number of threads and change
///   the default number of threads
static void initialize( int thread_count = -1 );
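/// Illustrative call sequence (sketch only):
///   Kokkos::OpenMP::initialize();   // thread_count == -1: use the OpenMP default
///   // ... launch parallel kernels ...
///   Kokkos::OpenMP::finalize();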
/** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
static bool wake();
/** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
static void fence() {}
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
/// \brief Free any resources being consumed by the device.
/// \brief Free any resources being consumed by the default execution space
static void finalize();
/** \brief Initialize the device.
*
* 1) If the hardware locality library is enabled and OpenMP has not
* already bound threads then bind OpenMP threads to maximize
* core utilization and group for memory hierarchy locality.
*
* 2) Allocate a HostThread for each OpenMP thread to hold its
* topology and fan in/out data.
*/
static void initialize( unsigned thread_count = 0 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 );
/// \brief is the default execution space initialized for the current 'master' thread
static bool is_initialized() noexcept;
static int is_initialized();
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool verbose = false );
/** \brief Return the maximum amount of concurrency. */
static int concurrency();
/// \brief is the instance running a parallel algorithm
inline
static bool in_parallel( OpenMP const& = OpenMP() ) noexcept;
//@}
//------------------------------------
/** \brief This execution space has a topological thread pool which can be queried.
*
* All threads within a pool have a common memory space for which they are cache coherent.
* depth = 0 gives the number of threads in the whole pool.
* depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache.
* depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache.
*/
inline static int thread_pool_size( int depth = 0 );
/// \brief Wait until all dispatched functors complete on the given instance
///
/// This is a no-op on OpenMP
inline
static void fence( OpenMP const& = OpenMP() ) noexcept;
/// \brief Does the given instance return immediately after launching
/// a parallel algorithm
///
/// This always returns false on OpenMP
inline
static bool is_asynchronous( OpenMP const& = OpenMP() ) noexcept;
/// \brief Partition the default instance into new instances without creating
/// new masters
///
/// This is a no-op on OpenMP since the default instance cannot be partitioned
/// without promoting other threads to 'master'
static std::vector<OpenMP> partition(...);
/// Non-default instances should be ref-counted so that when the last
/// is destroyed, the instance's resources are released
///
/// This is a no-op on OpenMP since a non-default instance cannot be created
static OpenMP create_instance(...);
/// \brief Partition the default instance and call 'f' on each new 'master' thread
///
/// Func is a functor with the following signature
/// void( int partition_id, int num_partitions )
template <typename F>
static void partition_master( F const& f
, int requested_num_partitions = 0
, int requested_partition_size = 0
);
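/// Illustrative sketch (the partition counts here are arbitrary):
///   Kokkos::OpenMP::partition_master(
///     []( int partition_id , int num_partitions ) {
///       // each partition's 'master' thread executes this body
///     }
///     , 2 /* requested_num_partitions */
///     , 0 /* requested_partition_size */ );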
inline
static int thread_pool_size() noexcept;
/** \brief The rank of the executing thread in this thread pool */
KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
KOKKOS_INLINE_FUNCTION
static int thread_pool_rank() noexcept;
//------------------------------------
#if !defined( KOKKOS_DISABLE_DEPRECATED )
/// \brief Initialize the default execution space
static void initialize( int thread_count,
int use_numa_count,
int use_cores_per_numa = 0);
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
inline
static int thread_pool_size( int depth );
KOKKOS_INLINE_FUNCTION static
unsigned hardware_thread_id() { return thread_pool_rank(); }
static void sleep() {};
static void wake() {};
static const char* name();
// use UniqueToken
static int concurrency();
// use UniqueToken
inline
static int max_hardware_threads() noexcept;
// use UniqueToken
KOKKOS_INLINE_FUNCTION
static int hardware_thread_id() noexcept;
#endif
static constexpr const char* name() noexcept { return "OpenMP"; }
};
} // namespace Kokkos
@ -195,6 +240,7 @@ struct VerifyExecutionCanAccessMemorySpace
/*--------------------------------------------------------------------------*/
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
#include <OpenMP/Kokkos_OpenMP_Team.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>

View File

@ -177,22 +177,23 @@ void parallel_for( const ExecPolicy & policy
)
{
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecPolicy::work_tag> name(str);
Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
#endif
}
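// Illustrative sketch of how a user-supplied label reaches the profiling hooks
// (kernel name and bounds are hypothetical):
//
//   Kokkos::parallel_for( "MyKernel" , Kokkos::RangePolicy<>( 0 , N ) ,
//                         KOKKOS_LAMBDA( const int i ) { /* ... */ } );
//
// With a profiling library loaded, "MyKernel" is forwarded through
// ParallelConstructName::get() to beginParallelFor(); an empty label is
// presumed to fall back to a functor-derived name, as in the previous code.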
@ -210,14 +211,15 @@ void parallel_for( const size_t work_count
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
Kokkos::Profiling::beginParallelFor(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
@ -420,21 +422,22 @@ void parallel_scan( const ExecutionPolicy & policy
{
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, typename ExecutionPolicy::work_tag> name(str);
Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif
}
@ -453,21 +456,22 @@ void parallel_scan( const size_t work_count
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, void> name(str);
Kokkos::Profiling::beginParallelScan(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
#endif
}

View File

@ -872,13 +872,14 @@ namespace Impl {
const FunctorType& functor,
ReturnType& return_value) {
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
}
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Impl::ParallelConstructName<FunctorType, typename PolicyType::work_tag> name(label);
Kokkos::Profiling::beginParallelReduce(name.get(), 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Kokkos::Impl::shared_allocation_tracking_disable();
#ifdef KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
Impl::ParallelReduce<typename functor_adaptor::functor_type, PolicyType, typename return_value_adapter::reducer_type >
closure(functor_adaptor::functor(functor),
@ -890,13 +891,13 @@ namespace Impl {
policy,
return_value_adapter::return_value(return_value,functor));
#endif
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
Kokkos::Impl::shared_allocation_tracking_enable();
closure.execute();
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}
#endif
}

View File

@ -66,6 +66,7 @@
#include <KokkosExp_MDRangePolicy.hpp>
#include <Kokkos_UniqueToken.hpp>
namespace Kokkos {
@ -526,6 +527,7 @@ public:
}
};
/*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Traits >
@ -604,6 +606,178 @@ public:
{}
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with MDRangePolicy */
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ;
void
exec() const
{
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
iterate_type( m_mdr_policy, m_functor )( i );
}
}
public:
inline
void execute() const
{ this->exec(); }
inline
ParallelFor( const FunctorType & arg_functor
, const MDRangePolicy & arg_policy )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
{}
};
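// Illustrative usage sketch (extents and lambda body are hypothetical): this
// specialization is selected when an MDRangePolicy is dispatched on the Serial
// backend through the usual parallel_for path, e.g.
//
//   using policy2d = Kokkos::Experimental::MDRangePolicy< Kokkos::Serial
//                                                       , Kokkos::Experimental::Rank<2> >;
//   Kokkos::parallel_for( policy2d( {{0,0}} , {{N,M}} ) ,
//                         KOKKOS_LAMBDA( const int i , const int j ) { /* ... */ } );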
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef typename ReducerTypeFwd::value_type ValueType;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
>;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
inline
void
exec( reference_type update ) const
{
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
iterate_type( m_mdr_policy, m_functor, update )( i );
}
}
public:
inline
void execute() const
{
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
const size_t team_reduce_size = 0 ; // Never shrinks
const size_t team_shared_size = 0 ; // Never shrinks
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
pointer_type ptr =
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
this-> exec( update );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const MDRangePolicy & arg_policy ,
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Serial reduce result must be a View" );
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Serial reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
} // namespace Kokkos
@ -819,6 +993,60 @@ public:
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos { namespace Experimental {
template<>
class UniqueToken< Serial, UniqueTokenScope::Instance>
{
public:
using execution_space = Serial;
using size_type = int;
/// \brief create an object sized for the available concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return 1; }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return 0; }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
template<>
class UniqueToken< Serial, UniqueTokenScope::Global>
{
public:
using execution_space = Serial;
using size_type = int;
/// \brief create an object sized for the available concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return 1; }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return 0; }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
}} // namespace Kokkos::Experimental
#include <impl/Kokkos_Serial_Task.hpp>
#endif // defined( KOKKOS_ENABLE_SERIAL )

View File

@ -148,7 +148,7 @@ private:
typename std::conditional< Arg2_is_space , Arg2 , void
>::type >::type ;
using task_base = Impl::TaskBase< Space , ValueType , void > ;
using task_base = Impl::TaskBase< void , void , void > ;
using queue_type = Impl::TaskQueue< Space > ;
task_base * m_task ;
@ -293,13 +293,17 @@ public:
//----------------------------------------
KOKKOS_INLINE_FUNCTION
typename task_base::get_return_type
int is_ready() const noexcept
{ return ( 0 == m_task ) || ( ((task_base*) task_base::LockTag) == m_task->m_wait ); }
KOKKOS_INLINE_FUNCTION
const typename Impl::TaskResult< ValueType >::reference_type
get() const
{
if ( 0 == m_task ) {
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
}
return m_task->get();
return Impl::TaskResult< ValueType >::get( m_task );
}
};
@ -396,7 +400,7 @@ private:
using track_type = Kokkos::Impl::SharedAllocationTracker ;
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
using task_base = Impl::TaskBase< void , void , void > ;
track_type m_track ;
queue_type * m_queue ;
@ -464,29 +468,19 @@ public:
KOKKOS_INLINE_FUNCTION
memory_pool * memory() const noexcept
{ return m_queue ? m_queue->m_memory : (memory_pool*) 0 ; }
{ return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; }
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using task_type = Impl::TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
return m_queue->allocate_block_size( sizeof(task_type) );
}
{ return m_queue->template spawn_allocation_size< FunctorType >(); }
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
}
{ return m_queue->when_all_allocation_size( narg ); }
//----------------------------------------
@ -507,7 +501,7 @@ public:
queue_type * const queue =
arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
arg_policy.m_dependence.m_task
? arg_policy.m_dependence.m_task->m_queue
? static_cast<queue_type*>(arg_policy.m_dependence.m_task->m_queue)
: (queue_type*) 0 );
if ( 0 == queue ) {
@ -530,8 +524,12 @@ public:
future_type f ;
// Allocate task from memory pool
const size_t alloc_size =
queue->template spawn_allocation_size< FunctorType >();
f.m_task =
reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
reinterpret_cast< task_type * >(queue->allocate(alloc_size) );
if ( f.m_task ) {
@ -539,15 +537,17 @@ public:
// Reference count starts at two:
// +1 for the matching decrement when task is complete
// +1 for the future
new ( f.m_task )
task_type( arg_function
, queue
, arg_policy.m_dependence.m_task /* dependence */
, 2 /* reference count */
, int(sizeof(task_type)) /* allocation size */
, int(arg_policy.m_task_type)
, int(arg_policy.m_priority)
, std::move(arg_functor) );
new ( f.m_task ) task_type( std::move(arg_functor) );
f.m_task->m_apply = arg_function ;
f.m_task->m_queue = queue ;
f.m_task->m_next = arg_policy.m_dependence.m_task ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = alloc_size ;
f.m_task->m_task_type = arg_policy.m_task_type ;
f.m_task->m_priority = arg_policy.m_priority ;
Kokkos::memory_fence();
// The dependence (if any) is processed immediately
// within the schedule function, as such the dependence's
@ -586,6 +586,30 @@ public:
// Postcondition: task is in Executing-Respawn state
}
template< typename FunctorType >
KOKKOS_FUNCTION static
void
respawn( FunctorType * arg_self
, TaskScheduler const &
, TaskPriority const & arg_priority
)
{
// Precondition: task is in Executing state
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_type * const task = static_cast< task_type * >( arg_self );
task->m_priority = static_cast<int>(arg_priority);
task->add_dependence( (task_base*) 0 );
// Postcondition: task is in Executing-Respawn state
}
//----------------------------------------
/**\brief Return a future that is complete
* when all input futures are complete.
@ -596,7 +620,7 @@ public:
when_all( Future< A1 , A2 > const arg[] , int narg )
{
using future_type = Future< execution_space > ;
using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using task_base = Kokkos::Impl::TaskBase< void , void , void > ;
future_type f ;
@ -610,9 +634,9 @@ public:
// Increment reference count to track subsequent assignment.
Kokkos::atomic_increment( &(t->m_ref_count) );
if ( queue == 0 ) {
queue = t->m_queue ;
queue = static_cast< queue_type * >( t->m_queue );
}
else if ( queue != t->m_queue ) {
else if ( queue != static_cast< queue_type * >( t->m_queue ) ) {
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
}
}
@ -620,28 +644,34 @@ public:
if ( queue != 0 ) {
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
size_t const alloc_size = queue->when_all_allocation_size( narg );
f.m_task =
reinterpret_cast< task_base * >( queue->allocate( size ) );
reinterpret_cast< task_base * >( queue->allocate( alloc_size ) );
if ( f.m_task ) {
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
new( f.m_task ) task_base( queue
, 2 /* reference count */
, size /* allocation size */
, narg /* dependence count */
);
new( f.m_task ) task_base();
f.m_task->m_queue = queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = alloc_size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
// Assign dependences, reference counts were already incremented
task_base ** const dep = f.m_task->aggregate_dependences();
task_base * volatile * const dep =
f.m_task->aggregate_dependences();
for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
Kokkos::memory_fence();
queue->schedule_aggregate( f.m_task );
// this when_all may be processed at any moment
}

View File

@ -0,0 +1,88 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_UNIQUE_TOKEN_HPP
#define KOKKOS_UNIQUE_TOKEN_HPP
#include <Kokkos_Macros.hpp>
namespace Kokkos { namespace Experimental {
enum class UniqueTokenScope : int
{
Instance,
Global
};
/// \brief class to generate unique ids based on the required amount of concurrency
///
/// This object should behave like a ref-counted object, so that when the last
/// instance is destroyed, resources are freed if needed
template <typename ExecutionSpace, UniqueTokenScope = UniqueTokenScope::Instance >
class UniqueToken
{
public:
using execution_space = ExecutionSpace;
using size_type = typename execution_space::size_type;
/// \brief create an object sized for the available concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() );
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
size_type size() const ;
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
size_type acquire() const ;
/// \brief release a value acquired by acquire()
KOKKOS_INLINE_FUNCTION
void release( size_type ) const ;
};
}} // namespace Kokkos::Experimental
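// Illustrative usage sketch (the execution space, extent, and lambda body are
// hypothetical): acquire a per-thread id inside a kernel and release it before
// returning, so at most size() ids are in flight at once.
//
//   Kokkos::Experimental::UniqueToken< ExecSpace > token ;
//   Kokkos::parallel_for( N , KOKKOS_LAMBDA( const int i ) {
//     const int id = token.acquire();   // 0 <= id < token.size()
//     /* ... use id to index per-thread scratch ... */
//     token.release( id );
//   } );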
#endif //KOKKOS_UNIQUE_TOKEN_HPP

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -54,11 +54,14 @@
#include <Kokkos_MemoryTraits.hpp>
#include <Kokkos_ExecPolicy.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< class DataType >
@ -73,16 +76,6 @@ struct ViewDataAnalysis ;
template< class , class ... >
class ViewMapping { public: enum { is_assignable = false }; };
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Impl {
using Kokkos::Experimental::Impl::ViewMapping ;
using Kokkos::Experimental::Impl::ViewDataAnalysis ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -1563,12 +1556,12 @@ namespace Kokkos {
namespace Impl {
inline
void shared_allocation_tracking_claim_and_disable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_claim_and_disable(); }
void shared_allocation_tracking_disable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); }
inline
void shared_allocation_tracking_release_and_enable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_release_and_enable(); }
void shared_allocation_tracking_enable()
{ Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable(); }
} /* namespace Impl */
} /* namespace Kokkos */
@ -1795,6 +1788,20 @@ void deep_copy
if ( (void *) dst.data() != (void*) src.data() ) {
#if defined(KOKKOS_ENABLE_PROFILING)
if (Kokkos::Profiling::profileLibraryLoaded()) {
const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
Kokkos::Profiling::beginDeepCopy(
Kokkos::Profiling::SpaceHandle(dst_memory_space::name()),
dst.label(),
dst.data(),
Kokkos::Profiling::SpaceHandle(src_memory_space::name()),
src.label(),
src.data(),
nbytes);
}
#endif
// Concern: If overlapping views then a parallel copy will be erroneous.
// ...
@ -1882,7 +1889,14 @@ void deep_copy
else {
Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
}
}
#if defined(KOKKOS_ENABLE_PROFILING)
if (Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endDeepCopy();
}
#endif
} // ( (void *) dst.data() != (void*) src.data() )
}
} /* namespace Kokkos */
@ -2249,6 +2263,82 @@ resize( Kokkos::View<T,P...> & v ,
static_assert( Kokkos::ViewTraits<T,P...>::is_managed , "Can only resize managed views" );
// Fix #904 by checking dimensions before actually resizing.
//
// Rank is known at compile time, so hopefully the compiler will
// remove branches that are compile-time false. The upcoming "if
// constexpr" language feature would make this certain.
if (view_type::Rank == 1 &&
n0 == static_cast<size_t> (v.extent(0))) {
return;
}
if (view_type::Rank == 2 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1))) {
return;
}
if (view_type::Rank == 3 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2))) {
return;
}
if (view_type::Rank == 4 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3))) {
return;
}
if (view_type::Rank == 5 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4))) {
return;
}
if (view_type::Rank == 6 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4)) &&
n5 == static_cast<size_t> (v.extent(5))) {
return;
}
if (view_type::Rank == 7 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4)) &&
n5 == static_cast<size_t> (v.extent(5)) &&
n6 == static_cast<size_t> (v.extent(6))) {
return;
}
if (view_type::Rank == 8 &&
n0 == static_cast<size_t> (v.extent(0)) &&
n1 == static_cast<size_t> (v.extent(1)) &&
n2 == static_cast<size_t> (v.extent(2)) &&
n3 == static_cast<size_t> (v.extent(3)) &&
n4 == static_cast<size_t> (v.extent(4)) &&
n5 == static_cast<size_t> (v.extent(5)) &&
n6 == static_cast<size_t> (v.extent(6)) &&
n7 == static_cast<size_t> (v.extent(7))) {
return;
}
// If Kokkos ever supports Views of rank > 8, the above code won't
// be incorrect, because avoiding reallocation in resize() is just
// an optimization.
// TODO (mfh 27 Jun 2017) If the old View has enough space but just
// different dimensions (e.g., if the product of the dimensions,
// including extra space for alignment, will not change), then
// consider just reusing storage. For now, Kokkos always
// reallocates if any of the dimensions change, even if the old View
// has enough space.
view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
Kokkos::Impl::ViewRemap< view_type , view_type >( v_resized , v );
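// Illustrative sketch of the new early-return behavior (extents are hypothetical):
//
//   Kokkos::View<double**> a( "A" , 100 , 50 );
//   Kokkos::resize( a , 100 , 50 );   // extents unchanged: returns without reallocating (Fix #904)
//   Kokkos::resize( a , 200 , 50 );   // extents differ: allocates and remaps as before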
@ -2317,6 +2407,106 @@ void realloc( Kokkos::View<T,P...> & v ,
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos { namespace Impl {
template < class Specialize, typename A, typename B >
struct CommonViewValueType;
template < typename A, typename B >
struct CommonViewValueType< void, A, B >
{
using value_type = typename std::common_type< A , B >::type;
};
template < class Specialize, class ValueType >
struct CommonViewAllocProp;
template < class ValueType >
struct CommonViewAllocProp< void, ValueType >
{
using value_type = ValueType;
template < class ... Views >
CommonViewAllocProp( const Views & ... ) {}
};
template < class ... Views >
struct DeduceCommonViewAllocProp;
// Base case must provide types for:
// 1. specialize 2. value_type 3. is_view 4. prop_type
template < class FirstView >
struct DeduceCommonViewAllocProp< FirstView >
{
using specialize = typename FirstView::traits::specialize;
using value_type = typename FirstView::traits::value_type;
enum : bool { is_view = is_view< FirstView >::value };
using prop_type = CommonViewAllocProp< specialize, value_type >;
};
template < class FirstView, class ... NextViews >
struct DeduceCommonViewAllocProp< FirstView, NextViews... >
{
using NextTraits = DeduceCommonViewAllocProp< NextViews... >;
using first_specialize = typename FirstView::traits::specialize;
using first_value_type = typename FirstView::traits::value_type;
enum : bool { first_is_view = is_view< FirstView >::value };
using next_specialize = typename NextTraits::specialize;
using next_value_type = typename NextTraits::value_type;
enum : bool { next_is_view = NextTraits::is_view };
// common types
// determine specialize type
// if first and next specialize differ, but are not the same specialize, error out
static_assert( !(!std::is_same< first_specialize, next_specialize >::value && !std::is_same< first_specialize, void>::value && !std::is_same< void, next_specialize >::value) , "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void specialize trait allowed" );
// otherwise choose non-void specialize if either/both are non-void
using specialize = typename std::conditional< std::is_same< first_specialize, next_specialize >::value
, first_specialize
, typename std::conditional< ( std::is_same< first_specialize, void >::value
&& !std::is_same< next_specialize, void >::value)
, next_specialize
, first_specialize
>::type
>::type;
using value_type = typename CommonViewValueType< specialize, first_value_type, next_value_type >::value_type;
enum : bool { is_view = (first_is_view && next_is_view) };
using prop_type = CommonViewAllocProp< specialize, value_type >;
};
} // end namespace Impl
template < class ... Views >
using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...>::prop_type ;
// User function
template < class ... Views >
DeducedCommonPropsType<Views...>
common_view_alloc_prop( Views const & ... views )
{
return DeducedCommonPropsType<Views...>( views... );
}
} // namespace Kokkos
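// Illustrative usage sketch (the views are hypothetical): deduce a common
// allocation property, and thereby a common value_type, from several views.
//
//   Kokkos::View<float*>  x( "x" , n );
//   Kokkos::View<double*> y( "y" , n );
//   auto prop = Kokkos::common_view_alloc_prop( x , y );
//   using common_value_type = typename decltype(prop)::value_type ;  // double, via std::common_type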
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// For backward compatibility:
@ -2350,6 +2540,9 @@ using Kokkos::Impl::WithoutInitializing_t ;
using Kokkos::Impl::AllowPadding_t ;
using Kokkos::Impl::SharedAllocationRecord ;
using Kokkos::Impl::SharedAllocationTracker ;
using Kokkos::Impl::ViewMapping ;
using Kokkos::Impl::ViewDataAnalysis ;
} /* namespace Impl */
} /* namespace Experimental */

View File

@ -0,0 +1,265 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_WORKGRAPHPOLICY_HPP
#define KOKKOS_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec;
}}} // namespace Kokkos::Impl::Experimental
namespace Kokkos {
namespace Experimental {
template< class ... Properties >
class WorkGraphPolicy
{
public:
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using execution_space = typename traits::execution_space;
using work_tag = typename traits::work_tag;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
using member_type = index_type;
private:
graph_type m_graph;
using ints_type = Kokkos::View<std::int32_t*, memory_space>;
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
using ranges_type = Kokkos::View<range_type*, memory_space>;
const std::int32_t m_total_work;
ints_type m_counts;
ints_type m_queue;
ranges_type m_ranges;
public:
struct TagZeroRanges {};
KOKKOS_INLINE_FUNCTION
void operator()(TagZeroRanges, std::int32_t i) const {
m_ranges[i] = range_type(0, 0);
}
void zero_ranges() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, 1));
closure.execute();
execution_space::fence();
}
struct TagFillQueue {};
KOKKOS_INLINE_FUNCTION
void operator()(TagFillQueue, std::int32_t i) const {
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
}
void fill_queue() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_total_work));
closure.execute();
execution_space::fence();
}
private:
inline
void setup() {
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
}
get_crs_transpose_counts(m_counts, m_graph);
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
deep_copy(m_queue, std::int32_t(-1));
m_ranges = ranges_type("ranges", 1);
fill_queue();
}
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first + 1 , w.second );
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
if ( w.first < w.second ) { // there was work in the queue
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
// we got a work item
std::int32_t i;
// the push_work function may have incremented the end counter
// but not yet written the work index into the queue.
// wait until the entry is valid.
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
return i;
} // we got a work item
} else { // there was no work in the queue
#ifdef KOKKOS_DEBUG
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
Kokkos::abort("bug in pop_work");
}
#endif
if (w.first == m_total_work) { // all work is done
return -1;
} else { // need to wait for more work to be pushed
// take a guess that one work item will be pushed
// the key thing is we can't leave (w) alone, because
// otherwise the next compare_exchange may succeed in
// popping work from an empty queue
w.second++;
}
} // there was no work in the queue
} // while (true)
}
KOKKOS_INLINE_FUNCTION
void push_work(std::int32_t i) const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first , w.second + 1 );
// try to increment the end counter
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
// stop trying if the increment was successful
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
}
// write the work index into the claimed spot in the queue
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
// push this write out into the memory system
memory_fence();
}
template< class functor_type , class execution_space, class ... policy_args >
friend class Kokkos::Impl::Experimental::WorkGraphExec;
public:
WorkGraphPolicy(graph_type arg_graph)
: m_graph(arg_graph)
, m_total_work( arg_graph.numRows() )
{
setup();
}
};
}} // namespace Kokkos::Experimental
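// Illustrative usage sketch (graph construction omitted, names hypothetical):
// dispatch work items so that each runs only after all of its predecessors
// have completed. 'graph' is assumed to be a Kokkos::Experimental::Crs in which
// row i lists the work items that depend on item i.
//
//   Kokkos::Experimental::WorkGraphPolicy< ExecSpace > policy( graph );
//   Kokkos::parallel_for( policy , KOKKOS_LAMBDA( const std::int32_t w ) {
//     /* ... process work item w ... */
//   } );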
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec
{
public:
using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >;
using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
using member_type = typename policy_type::member_type;
using memory_space = typename execution_space::memory_space;
protected:
const functor_type m_functor;
const policy_type m_policy;
protected:
KOKKOS_INLINE_FUNCTION
std::int32_t before_work() const {
return m_policy.pop_work();
}
KOKKOS_INLINE_FUNCTION
void after_work(std::int32_t i) const {
/* fence any writes that were done by the work item itself
(usually writing its result to global memory) */
memory_fence();
const std::int32_t begin = m_policy.m_graph.row_map( i );
const std::int32_t end = m_policy.m_graph.row_map( i + 1 );
for (std::int32_t j = begin; j < end; ++j) {
const std::int32_t next = m_policy.m_graph.entries( j );
const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
if ( old_count == 1 ) m_policy.push_work( next );
}
}
inline
WorkGraphExec( const functor_type & arg_functor
, const policy_type & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{
}
};
}}} // namespace Kokkos::Impl::Experimental
#ifdef KOKKOS_ENABLE_SERIAL
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"
#endif
#ifdef KOKKOS_ENABLE_OPENMP
#include "OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp"
#endif
#ifdef KOKKOS_ENABLE_CUDA
#include "Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp"
#endif
#ifdef KOKKOS_ENABLE_THREADS
#include "Threads/Kokkos_Threads_WorkGraphPolicy.hpp"
#endif
#endif /* #define KOKKOS_WORKGRAPHPOLICY_HPP */

View File

@ -45,75 +45,100 @@
#if defined( KOKKOS_ENABLE_OPENMP )
#include <cstdio>
#include <cstdlib>
#include <limits>
#include <iostream>
#include <vector>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <iostream>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
namespace Kokkos {
namespace Impl {
namespace {
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel();
int g_openmp_hardware_max_threads = 1;
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
__thread int t_openmp_hardware_id = 0;
__thread Impl::OpenMPExec * t_openmp_instance = nullptr;
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel()
void OpenMPExec::validate_partition( const int nthreads
, int & num_partitions
, int & partition_size
)
{
#ifndef __CUDA_ARCH__
return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
#else
return 0;
#endif
if (nthreads == 1) {
num_partitions = 1;
partition_size = 1;
}
else if( num_partitions < 1 && partition_size < 1) {
int idle = nthreads;
for (int np = 2; np <= nthreads ; ++np) {
for (int ps = 1; ps <= nthreads/np; ++ps) {
if (nthreads - np*ps < idle) {
idle = nthreads - np*ps;
num_partitions = np;
partition_size = ps;
}
if (idle == 0) {
break;
}
}
}
}
else if( num_partitions < 1 && partition_size > 0 ) {
if ( partition_size <= nthreads ) {
num_partitions = nthreads / partition_size;
}
else {
num_partitions = 1;
partition_size = nthreads;
}
}
else if( num_partitions > 0 && partition_size < 1 ) {
if ( num_partitions <= nthreads ) {
partition_size = nthreads / num_partitions;
}
else {
num_partitions = nthreads;
partition_size = 1;
}
}
else if ( num_partitions * partition_size > nthreads ) {
int idle = nthreads;
const int NP = num_partitions;
const int PS = partition_size;
for (int np = NP; np > 0; --np) {
for (int ps = PS; ps > 0; --ps) {
if ( (np*ps <= nthreads)
&& (nthreads - np*ps < idle) ) {
idle = nthreads - np*ps;
num_partitions = np;
partition_size = ps;
}
if (idle == 0) {
break;
}
}
}
}
}
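// Worked example (illustrative): with nthreads == 16 and both requests < 1, the
// nested search keeps the (np, ps) pair with the fewest idle threads and settles
// on num_partitions == 2, partition_size == 8 (idle == 0).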
bool s_using_hwloc = false;
} // namespace
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPExec::m_pool_topo[ 4 ] = { 0 };
HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
void OpenMPExec::verify_is_process( const char * const label )
void OpenMPExec::verify_is_master( const char * const label )
{
if ( omp_in_parallel() ) {
if ( !t_openmp_instance )
{
std::string msg( label );
msg.append( " ERROR: in parallel" );
msg.append( " ERROR: in parallel or not initialized" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void OpenMPExec::verify_initialized( const char * const label )
{
if ( 0 == m_pool[0] ) {
std::string msg( label );
msg.append( " ERROR: not initialized" );
Kokkos::Impl::throw_runtime_exception( msg );
}
if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) {
std::string msg( label );
msg.append( " ERROR: Initialized but threads modified inappropriately" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
} // namespace Impl
} // namespace Kokkos
@ -133,11 +158,11 @@ void OpenMPExec::clear_thread_data()
const int old_alloc_bytes =
m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
Kokkos::HostSpace space ;
OpenMP::memory_space space ;
#pragma omp parallel
#pragma omp parallel num_threads( m_pool_size )
{
const int rank = m_map_rank[ omp_get_thread_num() ];
const int rank = omp_get_thread_num();
if ( 0 != m_pool[rank] ) {
@ -189,13 +214,13 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
const int pool_size = omp_get_max_threads();
OpenMP::memory_space space ;
Kokkos::HostSpace space ;
memory_fence();
#pragma omp parallel
#pragma omp parallel num_threads(m_pool_size)
{
const int rank = m_map_rank[ omp_get_thread_num() ];
const int rank = omp_get_thread_num();
if ( 0 != m_pool[rank] ) {
@ -214,11 +239,14 @@ void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
, thread_local_bytes
);
memory_fence();
}
/* END #pragma omp parallel */
HostThreadTeamData::organize_pool( m_pool , pool_size );
HostThreadTeamData::organize_pool( m_pool , m_pool_size );
}
}
@ -232,16 +260,8 @@ namespace Kokkos {
//----------------------------------------------------------------------------
int OpenMP::is_initialized()
{ return 0 != Impl::OpenMPExec::m_pool[0]; }
void OpenMP::initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa )
int OpenMP::get_current_max_threads() noexcept
{
// Before any other call to OMP query the maximum number of threads
// and save the value for re-initialization unit testing.
// Using omp_get_max_threads() is problematic in conjunction with hwloc on Intel:
// an initial call to the OpenMP runtime without a preceding parallel region sets
// a process mask for a single core. On entering the first parallel region the
// runtime then binds threads to other cores and makes the process mask the
// aggregate of the thread masks. The intent seems to be to make serial code run
// fast if you compile with OpenMP enabled but never actually use parallel regions.
int nthreads = 0;
int count = 0;
#pragma omp parallel
{
#pragma omp atomic
nthreads++;
++count;
}
return count;
}
static int omp_max_threads = nthreads;
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
bool thread_spawn_failed = false ;
if ( ! is_initialized ) {
// Use hwloc thread pinning if concerned with locality.
// If spreading threads across multiple NUMA regions.
// If hyperthreading is enabled.
Impl::s_using_hwloc = hwloc::available() && (
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ];
// If hwloc is available then use its maximum value.
if ( thread_count == 0 ) {
thread_count = Impl::s_using_hwloc
? Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core()
: omp_max_threads ;
}
if(Impl::s_using_hwloc)
hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
false /* do not allow asynchronous */ ,
thread_count ,
use_numa_count ,
use_cores_per_numa ,
threads_coord );
// Spawn threads:
omp_set_num_threads( thread_count );
// Verify OMP interaction:
if ( int(thread_count) != omp_get_max_threads() ) {
thread_spawn_failed = true ;
}
// Verify spawning and bind threads:
#pragma omp parallel
{
#pragma omp critical
{
if ( int(thread_count) != omp_get_num_threads() ) {
thread_spawn_failed = true ;
}
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
// Call to 'new' may not be thread safe as well.
const unsigned omp_rank = omp_get_thread_num();
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
: omp_rank ;
Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ;
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
if ( ! thread_spawn_failed ) {
Impl::OpenMPExec::m_pool_topo[0] = thread_count ;
Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
// New, unified host thread team data:
{
size_t pool_reduce_bytes = 32 * thread_count ;
size_t team_reduce_bytes = 32 * thread_count ;
size_t team_shared_bytes = 1024 * thread_count ;
size_t thread_local_bytes = 1024 ;
Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
}
}
}
if ( is_initialized || thread_spawn_failed ) {
std::string msg("Kokkos::OpenMP::initialize ERROR");
if ( is_initialized ) { msg.append(" : already initialized"); }
if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
void OpenMP::initialize( int thread_count )
{
if ( omp_in_parallel() ) {
std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel");
Kokkos::Impl::throw_runtime_exception(msg);
}
if ( Impl::t_openmp_instance )
{
finalize();
}
{
if (nullptr == std::getenv("OMP_PROC_BIND") ) {
printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
printf(" For unit testing set OMP_PROC_BIND=false\n");
}
OpenMP::memory_space space ;
// Before any other call to OMP query the maximum number of threads
// and save the value for re-initialization unit testing.
Impl::g_openmp_hardware_max_threads = get_current_max_threads();
int process_num_threads = Impl::g_openmp_hardware_max_threads;
if ( Kokkos::hwloc::available() ) {
process_num_threads = Kokkos::hwloc::get_available_numa_count()
* Kokkos::hwloc::get_available_cores_per_numa()
* Kokkos::hwloc::get_available_threads_per_core();
}
// if thread_count < 0, use g_openmp_hardware_max_threads;
// if thread_count == 0, set g_openmp_hardware_max_threads to process_num_threads
// if thread_count > 0, set g_openmp_hardware_max_threads to thread_count
if (thread_count < 0 ) {
thread_count = Impl::g_openmp_hardware_max_threads;
}
else if( thread_count == 0 && Impl::g_openmp_hardware_max_threads != process_num_threads ) {
Impl::g_openmp_hardware_max_threads = process_num_threads;
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
else {
if( thread_count > process_num_threads ) {
printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count );
}
Impl::g_openmp_hardware_max_threads = thread_count;
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
// setup thread local
#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads)
{
Impl::t_openmp_instance = nullptr;
Impl::t_openmp_hardware_id = omp_get_thread_num();
Impl::SharedAllocationRecord< void, void >::tracking_enable();
}
void * const ptr = space.allocate( sizeof(Impl::OpenMPExec) );
Impl::t_openmp_instance = new (ptr) Impl::OpenMPExec( Impl::g_openmp_hardware_max_threads );
// New, unified host thread team data:
{
size_t pool_reduce_bytes = 32 * thread_count ;
size_t team_reduce_bytes = 32 * thread_count ;
size_t team_shared_bytes = 1024 * thread_count ;
size_t thread_local_bytes = 1024 ;
Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
}
}
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
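// Editorial usage sketch (not part of the diff): how the new
// OpenMP::initialize(int thread_count) / finalize() pair above is driven from
// application code. The value 8 and the helper name are illustrative
// assumptions; per the logic above, a negative count keeps the current
// maximum and 0 selects all hardware threads reported by hwloc.
inline void example_initialize_openmp()   // hypothetical helper, not part of Kokkos
{
  Kokkos::OpenMP::initialize( 8 );  // thread_count > 0: request a pool of exactly 8 threads
  // ... launch kernels on the Kokkos::OpenMP execution space ...
  Kokkos::OpenMP::finalize();
}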
@ -373,20 +382,38 @@ void OpenMP::initialize( unsigned thread_count ,
void OpenMP::finalize()
{
Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" );
if ( omp_in_parallel() )
{
std::string msg("Kokkos::OpenMP::finalize ERROR ");
if( !Impl::t_openmp_instance ) msg.append(": not initialized");
if( omp_in_parallel() ) msg.append(": in parallel");
Kokkos::Impl::throw_runtime_exception(msg);
}
// New, unified host thread team data:
Impl::OpenMPExec::clear_thread_data();
if ( Impl::t_openmp_instance ) {
Impl::OpenMPExec::m_pool_topo[0] = 0 ;
Impl::OpenMPExec::m_pool_topo[1] = 0 ;
Impl::OpenMPExec::m_pool_topo[2] = 0 ;
const int nthreads = Impl::t_openmp_instance->m_pool_size <= Impl::g_openmp_hardware_max_threads
? Impl::g_openmp_hardware_max_threads
: Impl::t_openmp_instance->m_pool_size;
omp_set_num_threads(1);
using Exec = Impl::OpenMPExec;
Exec * instance = Impl::t_openmp_instance;
instance->~Exec();
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
hwloc::unbind_this_thread();
OpenMP::memory_space space;
space.deallocate( instance, sizeof(Exec) );
#pragma omp parallel num_threads(nthreads)
{
Impl::t_openmp_hardware_id = 0;
Impl::t_openmp_instance = nullptr;
Impl::SharedAllocationRecord< void, void >::tracking_disable();
}
// allow main thread to track
Impl::SharedAllocationRecord< void, void >::tracking_enable();
Impl::g_openmp_hardware_max_threads = 1;
}
#if defined(KOKKOS_ENABLE_PROFILING)
@ -396,70 +423,48 @@ void OpenMP::finalize()
//----------------------------------------------------------------------------
void OpenMP::print_configuration( std::ostream & s , const bool detail )
void OpenMP::print_configuration( std::ostream & s , const bool verbose )
{
Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" );
s << "Kokkos::OpenMP" ;
#if defined( KOKKOS_ENABLE_OPENMP )
s << " KOKKOS_ENABLE_OPENMP" ;
#endif
#if defined( KOKKOS_ENABLE_HWLOC )
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
;
#endif
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
const bool is_initialized = Impl::t_openmp_instance != nullptr;
if ( is_initialized ) {
const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ;
const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
Impl::OpenMPExec::verify_is_master( "OpenMP::print_configuration" );
const int numa_count = 1;
const int core_per_numa = Impl::g_openmp_hardware_max_threads;
const int thread_per_core = 1;
s << " thread_pool_topology[ " << numa_count
<< " x " << core_per_numa
<< " x " << thread_per_core
<< " ]"
<< std::endl ;
if ( detail ) {
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] );
#pragma omp parallel
{
#pragma omp critical
{
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
s << " thread omp_rank[" << i << "]"
<< " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]"
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
<< std::endl ;
}
}
}
else {
s << " not initialized" << std::endl ;
}
}
std::vector<OpenMP> OpenMP::partition(...)
{ return std::vector<OpenMP>(1); }
OpenMP OpenMP::create_instance(...) { return OpenMP(); }
#if !defined( KOKKOS_DISABLE_DEPRECATED )
int OpenMP::concurrency() {
return thread_pool_size(0);
return Impl::g_openmp_hardware_max_threads;
}
const char* OpenMP::name() { return "OpenMP"; }
void OpenMP::initialize( int thread_count , int, int )
{
initialize(thread_count);
}
#endif
} // namespace Kokkos

View File

@ -47,6 +47,10 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP )
#if !defined(_OPENMP)
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
#endif
#include <Kokkos_OpenMP.hpp>
#include <impl/Kokkos_Traits.hpp>
@ -54,6 +58,8 @@
#include <Kokkos_Atomic.hpp>
#include <Kokkos_UniqueToken.hpp>
#include <iostream>
#include <sstream>
#include <fstream>
@ -63,8 +69,14 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace Kokkos { namespace Impl {
class OpenMPExec;
extern int g_openmp_hardware_max_threads;
extern __thread int t_openmp_hardware_id;
extern __thread OpenMPExec * t_openmp_instance;
//----------------------------------------------------------------------------
/** \brief Data for OpenMP thread execution */
@ -74,279 +86,279 @@ public:
friend class Kokkos::OpenMP ;
enum { MAX_THREAD_COUNT = 4096 };
enum { MAX_THREAD_COUNT = 512 };
void clear_thread_data();
static void validate_partition( const int nthreads
, int & num_partitions
, int & partition_size
);
private:
OpenMPExec( int arg_pool_size )
: m_pool_size{ arg_pool_size }
, m_level{ omp_get_level() }
, m_pool()
{}
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
~OpenMPExec()
{
clear_thread_data();
}
static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
int m_pool_size;
int m_level;
static
void clear_thread_data();
HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
public:
// Topology of a cache coherent thread pool:
// TOTAL = NUMA x GRAIN
// pool_size( depth = 0 )
// pool_size(0) = total number of threads
// pool_size(1) = number of threads per NUMA
// pool_size(2) = number of threads sharing finest grain memory hierarchy
static void verify_is_master( const char * const );
inline static
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
static void finalize();
static void initialize( const unsigned team_count ,
const unsigned threads_per_team ,
const unsigned numa_count ,
const unsigned cores_per_numa );
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static
void resize_thread_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes );
inline static
HostThreadTeamData * get_thread_data() noexcept
{ return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
inline
HostThreadTeamData * get_thread_data() const noexcept
{ return m_pool[ m_level == omp_get_level() ? 0 : omp_get_thread_num() ]; }
inline static
HostThreadTeamData * get_thread_data( int i ) noexcept
{ return m_pool[i]; }
inline
HostThreadTeamData * get_thread_data( int i ) const noexcept
{ return m_pool[i]; }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class ... Properties >
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
{
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicyInternal execution_policy ;
typedef PolicyTraits<Properties ... > traits;
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
m_league_size = p.m_league_size;
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & )
{ return traits::execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return traits::execution_space::thread_pool_size(2); }
//----------------------------------------
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
inline void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
// Maximum number of iterations each team will take:
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
set_auto_chunk_size();
}
public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
if(team_size_ < 0) team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1)
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
TeamPolicyInternal( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
inline int team_alloc() const { return m_team_alloc ; }
inline int team_iter() const { return m_team_iter ; }
inline int chunk_size() const { return m_chunk_size ; }
/** \brief set chunk_size to a discrete value*/
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
TeamPolicyInternal p = *this;
p.m_chunk_size = chunk_size_;
return p;
}
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
private:
/** \brief finalize chunk_size if it was set to AUTO*/
inline void set_auto_chunk_size() {
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
if( concurrency==0 ) concurrency=1;
if(m_chunk_size > 0) {
if(!Impl::is_integral_power_of_two( m_chunk_size ))
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
}
int new_chunk_size = 1;
while(new_chunk_size*100*concurrency < m_league_size)
new_chunk_size *= 2;
if(new_chunk_size < 128) {
new_chunk_size = 1;
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
new_chunk_size*=2;
}
m_chunk_size = new_chunk_size;
}
public:
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
};
} // namespace Impl
} // namespace Kokkos
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
inline
bool OpenMP::in_parallel()
{ return omp_in_parallel(); }
inline OpenMP::OpenMP() noexcept
{}
inline
int OpenMP::thread_pool_size( int depth )
bool OpenMP::is_initialized() noexcept
{ return Impl::t_openmp_instance != nullptr; }
inline
bool OpenMP::in_parallel( OpenMP const& ) noexcept
{
return Impl::OpenMPExec::pool_size(depth);
//t_openmp_instance is only non-null on a master thread
return !Impl::t_openmp_instance
|| Impl::t_openmp_instance->m_level < omp_get_level()
;
}
inline
int OpenMP::thread_pool_size() noexcept
{
return OpenMP::in_parallel()
? omp_get_num_threads()
: Impl::t_openmp_instance->m_pool_size
;
}
KOKKOS_INLINE_FUNCTION
int OpenMP::thread_pool_rank()
int OpenMP::thread_pool_rank() noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ];
return Impl::t_openmp_instance ? 0 : omp_get_thread_num();
#else
return -1 ;
#endif
}
inline
void OpenMP::fence( OpenMP const& instance ) noexcept {}
inline
bool OpenMP::is_asynchronous( OpenMP const& instance ) noexcept
{ return false; }
template <typename F>
void OpenMP::partition_master( F const& f
, int num_partitions
, int partition_size
)
{
if (omp_get_nested()) {
using Exec = Impl::OpenMPExec;
Exec * prev_instance = Impl::t_openmp_instance;
Exec::validate_partition( prev_instance->m_pool_size, num_partitions, partition_size );
OpenMP::memory_space space;
#pragma omp parallel num_threads(num_partitions)
{
void * const ptr = space.allocate( sizeof(Exec) );
Impl::t_openmp_instance = new (ptr) Exec( partition_size );
size_t pool_reduce_bytes = 32 * partition_size ;
size_t team_reduce_bytes = 32 * partition_size ;
size_t team_shared_bytes = 1024 * partition_size ;
size_t thread_local_bytes = 1024 ;
Impl::t_openmp_instance->resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
f( omp_get_thread_num(), omp_get_num_threads() );
Impl::t_openmp_instance->~Exec();
space.deallocate( Impl::t_openmp_instance, sizeof(Exec) );
Impl::t_openmp_instance = nullptr;
}
Impl::t_openmp_instance = prev_instance;
}
else {
// nested openmp not enabled
f(0,1);
}
}
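// Editorial usage sketch (not part of the diff): driving partition_master()
// above from application code. Each partition's master thread receives its
// partition rank and the number of partitions; more than one partition runs
// concurrently only when nested OpenMP is enabled, as checked above. The
// partition counts and helper name are illustrative assumptions.
inline void example_partition_master()   // hypothetical helper, not part of Kokkos
{
  Kokkos::OpenMP::partition_master(
    []( int partition_rank , int num_partitions ) {
      // kernels launched here use this partition's private OpenMPExec instance
    },
    2 /* num_partitions */ , 4 /* partition_size */ );
}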
namespace Experimental {
template<>
class MasterLock<OpenMP>
{
public:
void lock() { omp_set_lock( &m_lock ); }
void unlock() { omp_unset_lock( &m_lock ); }
bool try_lock() { return static_cast<bool>(omp_test_lock( &m_lock )); }
MasterLock() { omp_init_lock( &m_lock ); }
~MasterLock() { omp_destroy_lock( &m_lock ); }
MasterLock( MasterLock const& ) = delete;
MasterLock( MasterLock && ) = delete;
MasterLock & operator=( MasterLock const& ) = delete;
MasterLock & operator=( MasterLock && ) = delete;
private:
omp_lock_t m_lock;
};
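// Editorial usage sketch (not part of the diff): MasterLock<OpenMP> above wraps
// an omp_lock_t for serializing partition master threads. The guarded update
// and helper name are illustrative assumptions.
inline void example_master_lock( int & shared_counter )   // hypothetical helper, not part of Kokkos
{
  static Kokkos::Experimental::MasterLock< Kokkos::OpenMP > lock;
  lock.lock();
  ++shared_counter;   // update state shared between partition masters
  lock.unlock();
}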
template<>
class UniqueToken< OpenMP, UniqueTokenScope::Instance>
{
public:
using execution_space = OpenMP;
using size_type = int;
/// \brief create object size for concurrency on the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int size() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::OpenMP::thread_pool_size();
#else
return 0 ;
#endif
}
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int acquire() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::OpenMP::thread_pool_rank();
#else
return 0 ;
#endif
}
/// \brief release a value acquired by generate
KOKKOS_INLINE_FUNCTION
void release( int ) const noexcept {}
};
template<>
class UniqueToken< OpenMP, UniqueTokenScope::Global>
{
public:
using execution_space = OpenMP;
using size_type = int;
/// \brief create object size for concurrency on the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int size() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::Impl::g_openmp_hardware_max_threads ;
#else
return 0 ;
#endif
}
/// \brief acquire value such that 0 <= value < size()
KOKKOS_INLINE_FUNCTION
int acquire() const noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Kokkos::Impl::t_openmp_hardware_id ;
#else
return 0 ;
#endif
}
/// \brief release a value acquired by generate
KOKKOS_INLINE_FUNCTION
void release( int ) const noexcept {}
};
} // namespace Experimental
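// Editorial usage sketch (not part of the diff): using the instance-scope
// UniqueToken above to index thread-private scratch storage from inside a
// kernel. The 'scratch' View, extent, and helper name are illustrative
// assumptions.
inline void example_unique_token()   // hypothetical helper, not part of Kokkos
{
  Kokkos::Experimental::UniqueToken< Kokkos::OpenMP > token;
  Kokkos::View< double * , Kokkos::HostSpace > scratch( "scratch" , token.size() );
  Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
    [=]( int ) {
      const int id = token.acquire();   // 0 <= id < token.size()
      scratch( id ) += 1.0;             // thread-private slot
      token.release( id );
    });
}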
#if !defined( KOKKOS_DISABLE_DEPRECATED )
inline
int OpenMP::thread_pool_size( int depth )
{
return depth < 2
? thread_pool_size()
: 1;
}
KOKKOS_INLINE_FUNCTION
int OpenMP::hardware_thread_id() noexcept
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Impl::t_openmp_hardware_id;
#else
return -1 ;
#endif
}
inline
int OpenMP::max_hardware_threads() noexcept
{
return Impl::g_openmp_hardware_max_threads;
}
#endif // KOKKOS_DISABLE_DEPRECATED
} // namespace Kokkos
#endif

View File

@ -52,6 +52,8 @@
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -71,8 +73,9 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
const Policy m_policy ;
template< class TagType >
inline static
@ -110,16 +113,120 @@ private:
public:
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value
};
if ( OpenMP::in_parallel() ) {
exec_range< WorkTag >( m_functor
, m_policy.begin()
, m_policy.end() );
}
else {
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelFor::template
exec_range< WorkTag >( m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin() );
} while ( is_dynamic && 0 <= range.first );
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
{}
};
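// Editorial usage sketch (not part of the diff): a dynamically scheduled range
// that exercises the work-stealing branch of execute() above. The extent and
// helper name are illustrative assumptions.
inline void example_dynamic_range_for()   // hypothetical helper, not part of Kokkos
{
  Kokkos::parallel_for(
    Kokkos::RangePolicy< Kokkos::OpenMP , Kokkos::Schedule< Kokkos::Dynamic > >( 0 , 1000 ),
    [=]( int i ) { (void) i; /* chunks are obtained via get_work_stealing_chunk() */ });
}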
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member ibeg , const Member iend )
{
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
iterate_type( mdr_policy, functor )( iwork );
}
}
public:
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
if ( OpenMP::in_parallel() ) {
ParallelFor::exec_range ( m_mdr_policy
, m_functor
, m_policy.begin()
, m_policy.end() );
}
else {
#pragma omp parallel
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
@ -136,8 +243,8 @@ public:
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelFor::template
exec_range< WorkTag >( m_functor
ParallelFor::exec_range( m_mdr_policy
, m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin() );
@ -145,12 +252,15 @@ public:
}
// END #pragma omp parallel
}
}
inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, MDRangePolicy arg_policy )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
{}
};
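// Editorial usage sketch (not part of the diff): a rank-2 MDRangePolicy, which
// the specialization above flattens into RangePolicy(0, m_num_tiles) with chunk
// size 1. Extents, tile sizes, the helper name, and the Experimental namespace
// placement of Rank are illustrative assumptions for this Kokkos version.
inline void example_mdrange_for()   // hypothetical helper, not part of Kokkos
{
  using policy_t = Kokkos::Experimental::MDRangePolicy<
    Kokkos::OpenMP , Kokkos::Experimental::Rank<2> >;
  Kokkos::parallel_for( policy_t( {{ 0 , 0 }} , {{ 64 , 64 }} , {{ 4 , 4 }} ),
    [=]( int i , int j ) { (void) i; (void) j; /* one iteration of the tiled loop nest */ });
}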
@ -191,10 +301,11 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
const ReducerType m_reducer;
const pointer_type m_result_ptr;
template< class TagType >
inline static
@ -228,21 +339,21 @@ public:
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_bytes =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
OpenMPExec::resize_thread_data( pool_reduce_bytes
m_instance->resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
@ -271,16 +382,15 @@ public:
} while ( is_dynamic && 0 <= range.first );
}
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
for ( int i = 1 ; i < pool_size ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
, m_instance->get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -303,7 +413,8 @@ public:
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_view.data() )
@ -317,7 +428,8 @@ public:
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
@ -329,6 +441,173 @@ public:
};
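// Editorial usage sketch (not part of the diff): a scalar reduction whose
// per-thread partials live in pool_reduce_local() and are joined serially by
// the master thread, as in execute() above. The extent and helper name are
// illustrative assumptions.
inline double example_range_reduce()   // hypothetical helper, not part of Kokkos
{
  double total = 0.0;
  Kokkos::parallel_reduce(
    Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
    [=]( int i , double & partial ) { partial += double(i); },
    total );
  return total;
}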
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef typename ReducerTypeFwd::value_type ValueType;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
>;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update )
{
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
iterate_type( mdr_policy, functor, update )( iwork );
}
}
public:
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_bytes =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
m_instance->resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *(m_instance->get_thread_data());
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelReduce::exec_range ( m_mdr_policy, m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin()
, update );
} while ( is_dynamic && 0 <= range.first );
}
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < pool_size ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, m_instance->get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
//----------------------------------------
template< class ViewType >
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ViewType & arg_view
, typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_view.data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ReducerType& reducer )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
} // namespace Kokkos
@ -361,8 +640,9 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
template< class TagType >
inline static
@ -394,23 +674,23 @@ public:
inline
void execute() const
{
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_scan");
const int value_count = Analysis::value_count( m_functor );
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
OpenMPExec::resize_thread_data( pool_reduce_bytes
m_instance->resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
const WorkRange range( m_policy, omp_get_thread_num(), omp_get_num_threads() );
reference_type update_sum =
ValueInit::init( m_functor , data.pool_reduce_local() );
@ -422,7 +702,7 @@ public:
pointer_type ptr_prev = 0 ;
const int n = data.pool_size();
const int n = omp_get_num_threads();
for ( int i = 0 ; i < n ; ++i ) {
@ -452,7 +732,6 @@ public:
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end() , update_base , true );
}
/* END #pragma omp parallel */
}
@ -461,7 +740,8 @@ public:
inline
ParallelScan( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
{}
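// Editorial usage sketch (not part of the diff): an exclusive prefix sum using
// the two-pass scan implemented in execute() above. The Views, extent, and
// helper name are illustrative assumptions.
inline void example_range_scan()   // hypothetical helper, not part of Kokkos
{
  const int n = 1000;
  Kokkos::View< long * , Kokkos::HostSpace > counts ( "counts"  , n );
  Kokkos::View< long * , Kokkos::HostSpace > offsets( "offsets" , n );
  Kokkos::parallel_scan(
    Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , n ),
    [=]( int i , long & update , bool final_pass ) {
      if ( final_pass ) offsets( i ) = update;   // exclusive prefix sum
      update += counts( i );
    });
}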
@ -492,9 +772,10 @@ private:
typedef typename Policy::schedule_type::type SchedTag ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
const int m_shmem_size ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
const int m_shmem_size;
template< class TagType >
inline static
@ -548,22 +829,22 @@ public:
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_for");
const size_t pool_reduce_size = 0 ; // Never shrinks
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPExec::resize_thread_data( pool_reduce_size
m_instance->resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
const int active = data.organize_team( m_policy.team_size() );
@ -598,14 +879,14 @@ public:
data.disband_team();
}
// END #pragma omp parallel
}
inline
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
@ -646,11 +927,12 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const int m_shmem_size ;
OpenMPExec * m_instance;
const FunctorType m_functor;
const Policy m_policy;
const ReducerType m_reducer;
const pointer_type m_result_ptr;
const int m_shmem_size;
template< class TagType >
inline static
@ -706,8 +988,7 @@ public:
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_is_master("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
@ -716,14 +997,15 @@ public:
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPExec::resize_thread_data( pool_reduce_size
m_instance->resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
HostThreadTeamData & data = *(m_instance->get_thread_data());
const int active = data.organize_team( m_policy.team_size() );
@ -763,17 +1045,26 @@ public:
}
data.disband_team();
// This thread has updated 'pool_reduce_local()' with its
// contributions to the reduction. The parallel region is
// about to terminate and the master thread will load and
// reduce each 'pool_reduce_local()' contribution.
// Must 'memory_fence()' to guarantee that storing the update to
// 'pool_reduce_local()' will complete before this thread
// exits the parallel region.
memory_fence();
}
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
const pointer_type ptr = pointer_type( m_instance->get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
for ( int i = 1 ; i < pool_size ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
, m_instance->get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -796,7 +1087,8 @@ public:
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
@ -810,7 +1102,8 @@ public:
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
: m_instance( t_openmp_instance )
, m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )

View File

@ -105,7 +105,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
static task_root_type * const end =
@ -115,23 +115,19 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
HostThreadTeamData & team_data_single =
HostThreadTeamDataSingleton::singleton();
const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core
// const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA
Impl::OpenMPExec * instance = t_openmp_instance;
const int pool_size = OpenMP::thread_pool_size();
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
fflush(stdout);
#endif
const int team_size = 1; // Threads per core
instance->resize_thread_data( 0 /* global reduce buffer */
, 512 * team_size /* team reduce buffer */
, 0 /* team shared buffer */
, 0 /* thread local buffer */
);
OpenMPExec::resize_thread_data( 0 /* global reduce buffer */
, 512 * team_size /* team reduce buffer */
, 0 /* team shared buffer */
, 0 /* thread local buffer */
);
#pragma omp parallel
#pragma omp parallel num_threads(pool_size)
{
Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data();
Impl::HostThreadTeamData & self = *(instance->get_thread_data());
// Organizing threads into a team performs a barrier across the
// entire pool to ensure proper initialization of the team
@ -142,18 +138,6 @@ fflush(stdout);
Member single_exec( team_data_single );
Member team_exec( self );
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
// Loop until all queues are empty and no tasks in flight
task_root_type * task = 0 ;
@ -197,15 +181,6 @@ fflush(stdout);
// if a single thread task then execute now
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, int64_t(task)
);
fflush(stdout);
#endif
(*task->m_apply)( task , & single_exec );
leader_loop = true ;
@ -220,57 +195,14 @@ fflush(stdout);
if ( 0 != task ) { // Thread Team Task
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
, int64_t(task)
);
fflush(stdout);
#endif
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
}
} while( 0 != task );
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
}
self.disband_team();
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
, self.pool_rank()
, self.pool_size()
);
fflush(stdout);
#endif
}
// END #pragma omp parallel
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
fflush(stdout);
#endif
}
void TaskQueueSpecialization< Kokkos::OpenMP >::
@ -279,10 +211,10 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
if ( 1 == OpenMP::thread_pool_size() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

View File

@ -45,7 +45,7 @@
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -60,7 +60,7 @@ public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
// Must specify memory space

View File

@ -0,0 +1,245 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_TEAM_HPP
#define KOKKOS_OPENMP_TEAM_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP )
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
namespace Kokkos { namespace Impl {
template< class ... Properties >
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
{
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicyInternal execution_policy ;
typedef PolicyTraits<Properties ... > traits;
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
m_league_size = p.m_league_size;
m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter;
m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size;
return *this;
}
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & )
{ return traits::execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return traits::execution_space::thread_pool_size(2); }
//----------------------------------------
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
int m_team_iter ;
size_t m_team_scratch_size[2];
size_t m_thread_scratch_size[2];
int m_chunk_size;
inline void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
// Maximum number of iterations each team will take:
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
set_auto_chunk_size();
}
public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
if(team_size_ < 0) team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( typename traits::execution_space &
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1)
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
TeamPolicyInternal( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , team_size_request ); }
TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0)
{ init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
inline int team_alloc() const { return m_team_alloc ; }
inline int team_iter() const { return m_team_iter ; }
inline int chunk_size() const { return m_chunk_size ; }
/** \brief set chunk_size to a discrete value*/
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
TeamPolicyInternal p = *this;
p.m_chunk_size = chunk_size_;
return p;
}
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
TeamPolicyInternal p = *this;
p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size[level] = per_thread.value;
return p;
};
private:
/** \brief finalize chunk_size if it was set to AUTO*/
inline void set_auto_chunk_size() {
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
if( concurrency==0 ) concurrency=1;
if(m_chunk_size > 0) {
if(!Impl::is_integral_power_of_two( m_chunk_size ))
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
}
int new_chunk_size = 1;
while(new_chunk_size*100*concurrency < m_league_size)
new_chunk_size *= 2;
if(new_chunk_size < 128) {
new_chunk_size = 1;
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
new_chunk_size*=2;
}
m_chunk_size = new_chunk_size;
}
public:
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
};
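// Editorial usage sketch (not part of the diff): constructing a team policy for
// the OpenMP backend described by the class above. League size, the team body,
// and the helper name are illustrative assumptions.
inline void example_team_policy()   // hypothetical helper, not part of Kokkos
{
  typedef Kokkos::TeamPolicy< Kokkos::OpenMP > policy_t ;
  Kokkos::parallel_for( policy_t( 128 /* league_size */ , Kokkos::AUTO ),
    [=]( const policy_t::member_type & team ) {
      (void) team.league_rank();   // team.team_rank() and team.team_size() are also available
    });
}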
}} // namespace Kokkos::Impl
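// Editorial sketch (not part of the diff): the auto chunk-size heuristic from
// set_auto_chunk_size() above, extracted as a free function and evaluated for
// illustrative inputs. For concurrency = 16 and league_size = 100000 the first
// loop stops at 64; since 64 < 128 the second loop runs and the result is 128.
// The function name is a hypothetical label, not part of Kokkos.
inline int example_auto_chunk_size( int concurrency , int league_size )
{
  if ( concurrency == 0 ) concurrency = 1;
  int chunk = 1;
  while ( chunk * 100 * concurrency < league_size ) chunk *= 2;
  if ( chunk < 128 ) {
    chunk = 1;
    while ( ( chunk * 40 * concurrency < league_size ) && ( chunk < 128 ) ) chunk *= 2;
  }
  return chunk;  // example_auto_chunk_size( 16 , 100000 ) == 128
}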
#endif
#endif /* KOKKOS_OPENMP_TEAM_HPP */

View File

@ -0,0 +1,107 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
#define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::OpenMP
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::OpenMP,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
public:
inline
void execute()
{
const int pool_size = OpenMP::thread_pool_size();
#pragma omp parallel num_threads(pool_size)
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,7 +45,7 @@
#define KOKKOS_OPENMPTARGETEXEC_HPP
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <iostream>
@ -59,10 +59,10 @@ namespace Impl {
class OpenMPTargetExec {
public:
public:
enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 };
private:
static void* scratch_ptr;
@ -70,7 +70,7 @@ public:
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static void* get_scratch_ptr();
static void* get_scratch_ptr();
static void clear_scratch();
static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes );
@ -159,7 +159,7 @@ public:
KOKKOS_INLINE_FUNCTION void team_barrier() const
{
#pragma omp barrier
#pragma omp barrier
}
template<class ValueType>
@ -191,13 +191,13 @@ public:
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
for(int i = m_team_rank; i < n_values; i+= m_team_size) {
team_scratch[i] = value_type();
}
@ -209,7 +209,7 @@ public:
team_scratch[m_team_rank%n_values]+=value;
#pragma omp barrier
}
for(int d = 1; d<n_values;d*=2) {
if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
@ -374,12 +374,12 @@ private:
int m_chunk_size;
inline void init( const int league_size_request
, const int team_size_request
, const int team_size_request
, const int vector_length_request )
{
m_league_size = league_size_request ;
m_team_size = team_size_request;
m_team_size = team_size_request;
m_vector_length = vector_length_request;

View File

@ -47,7 +47,7 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
//----------------------------------------------------------------------------

View File

@ -45,14 +45,14 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_THREADS )
#include <Kokkos_Core_fwd.hpp>
#include <cstdint>
#include <limits>
#include <utility>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -80,9 +80,7 @@ const void * volatile s_current_function_arg = 0 ;
struct Sentinel {
Sentinel()
{
HostSpace::register_in_parallel( ThreadsExec::in_parallel );
}
{}
~Sentinel()
{
@ -122,6 +120,8 @@ void execute_function_noop( ThreadsExec & , const void * ) {}
void ThreadsExec::driver(void)
{
SharedAllocationRecord< void, void >::tracking_enable();
ThreadsExec this_thread ;
while ( ThreadsExec::Active == this_thread.m_pool_state ) {
@ -726,6 +726,8 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
Impl::SharedAllocationRecord< void, void >::tracking_enable();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif

View File

@ -50,11 +50,12 @@
#include <cstdio>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_UniqueToken.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
@ -275,6 +276,17 @@ public:
if ( ! rev_rank ) {
Final::final( f , reduce_memory() );
}
// This thread has updated 'reduce_memory()' and upon returning
// from this function will set 'm_pool_state' to inactive.
// If this is a non-root thread then setting 'm_pool_state'
// to inactive triggers another thread to exit a spinwait
// and read 'reduce_memory()'.
// Must 'memory_fence()' to guarantee that storing the update to
// 'reduce_memory()' completes before storing the update to
// 'm_pool_state'.
memory_fence();
}
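The ordering requirement described in the comment above is the usual release pattern: publish the data, fence, then flip the flag the waiter spins on. A minimal sketch of that pattern, assuming illustrative names (result, ready) that are not part of this diff:

#include <Kokkos_Core.hpp>

double       result = 0.0;   // stands in for 'reduce_memory()'
volatile int ready  = 0;     // stands in for 'm_pool_state'

void producer()
{
  result = 42.0;             // update the shared reduction value
  Kokkos::memory_fence();    // store to 'result' must be visible before the flag flips
  ready = 1;                 // the waiting thread spins on this flag
}

void consumer()
{
  while ( 0 == ready ) { /* spin-wait */ }
  const double r = result;   // safe: the fence ordered 'result' before 'ready'
  (void) r;
}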
inline
@ -627,6 +639,62 @@ inline void Threads::fence()
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos { namespace Experimental {
template<>
class UniqueToken< Threads, UniqueTokenScope::Instance>
{
public:
using execution_space = Threads;
using size_type = int;
/// \brief create an object sized for the concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return Threads::thread_pool_size(); }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return Threads::thread_pool_rank(); }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
template<>
class UniqueToken< Threads, UniqueTokenScope::Global>
{
public:
using execution_space = Threads;
using size_type = int;
/// \brief create an object sized for the concurrency of the given instance
///
/// This object should not be shared between instances
UniqueToken( execution_space const& = execution_space() ) noexcept {}
/// \brief upper bound for acquired values, i.e. 0 <= value < size()
inline
int size() const noexcept { return Threads::thread_pool_size(); }
/// \brief acquire value such that 0 <= value < size()
inline
int acquire() const noexcept { return Threads::thread_pool_rank(); }
/// \brief release a value acquired by acquire()
inline
void release( int ) const noexcept {}
};
}} // namespace Kokkos::Experimental
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
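A hedged usage sketch for the UniqueToken specializations added above: acquire a per-thread slot inside a kernel, index scratch storage with it, and release it afterwards. The View, the loop bound N, and the kernel body are illustrative, not part of the diff.

#include <Kokkos_Core.hpp>

void count_with_unique_token()
{
  const int N = 1000;
  Kokkos::Experimental::UniqueToken< Kokkos::Threads > token;
  Kokkos::View< double*, Kokkos::HostSpace > scratch( "scratch", token.size() );

  Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0, N ),
    KOKKOS_LAMBDA( const int i ) {
      const int id = token.acquire();   // 0 <= id < token.size()
      scratch( id ) += double(i);       // per-thread slot, so no data race
      token.release( id );
    } );
}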

View File

@ -50,7 +50,7 @@
#include <cstdio>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
@ -482,6 +482,8 @@ public:
void next_static()
{
if ( m_league_rank < m_league_end ) {
// Make sure all stores are complete before entering the barrier
memory_fence();
team_barrier();
set_team_shared();
}
@ -518,6 +520,8 @@ public:
return;
if ( m_league_rank < m_league_chunk_end ) {
// Make sure all stores are complete before entering the barrier
memory_fence();
team_barrier();
set_team_shared();
}

View File

@ -55,6 +55,8 @@
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
@ -174,6 +176,108 @@ public:
{}
};
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member ibeg , const Member iend )
{
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
iterate_type( mdr_policy, functor )( i );
}
}
static void exec( ThreadsExec & exec , const void * arg )
{
exec_schedule<typename Policy::schedule_type::type>(exec,arg);
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
ParallelFor::exec_range
( self.m_mdr_policy, self.m_functor , range.begin() , range.end() );
exec.fan_in();
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
exec.reset_steal_target();
exec.barrier();
long work_index = exec.get_work_index();
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
ParallelFor::exec_range
( self.m_mdr_policy, self.m_functor , begin , end );
work_index = exec.get_work_index();
}
exec.fan_in();
}
public:
inline
void execute() const
{
ThreadsExec::start( & ParallelFor::exec , this );
ThreadsExec::fence();
}
ParallelFor( const FunctorType & arg_functor
, const MDRangePolicy & arg_policy )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
{}
};
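With the ParallelFor specialization above in place, an MDRangePolicy can be dispatched directly through Kokkos::parallel_for on the Threads backend. A hedged sketch of that use; the Rank<2> spelling in Kokkos::Experimental, the bounds, and the kernel body are assumptions matching this release, not taken from the diff itself.

#include <Kokkos_Core.hpp>

void fill_2d( Kokkos::View< double**, Kokkos::HostSpace > a )
{
  typedef Kokkos::Experimental::MDRangePolicy<
            Kokkos::Threads, Kokkos::Experimental::Rank<2> > mdrange_2d;

  const long N = a.dimension_0();
  const long M = a.dimension_1();

  // The policy is flattened into RangePolicy( 0, num_tiles ) internally,
  // one tile of the 2-D index space per flattened work item.
  Kokkos::parallel_for( mdrange_2d( {{0,0}}, {{N,M}} ),
    KOKKOS_LAMBDA( const int i, const int j ) {
      a(i,j) = double(i) + 0.1 * double(j);
    } );
}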
//----------------------------------------------------------------------------
/* ParallelFor Kokkos::Threads with TeamPolicy */
@ -440,6 +544,169 @@ public:
};
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef typename ReducerTypeFwd::value_type ValueType;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType
>;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
const Policy m_policy ; // construct as RangePolicy( 0, num_tiles ).set_chunk_size(1) in ctor
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
inline static
void
exec_range( const MDRangePolicy & mdr_policy
, const FunctorType & functor
, const Member & ibeg , const Member & iend
, reference_type update )
{
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
iterate_type( mdr_policy, functor, update )( i );
}
}
static void
exec( ThreadsExec & exec , const void * arg ) {
exec_schedule<typename Policy::schedule_type::type>(exec, arg);
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::exec_range
( self.m_mdr_policy, self.m_functor , range.begin() , range.end()
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
template<class Schedule>
static
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
exec_schedule( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),self.m_policy.chunk_size());
exec.reset_steal_target();
exec.barrier();
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
ParallelReduce::exec_range
( self.m_mdr_policy, self.m_functor , begin , end
, update );
work_index = exec.get_work_index();
}
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
}
public:
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
ThreadsExec::start( & ParallelReduce::exec , this );
ThreadsExec::fence();
if ( m_result_ptr ) {
const pointer_type data =
(pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const MDRangePolicy & arg_policy ,
const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Threads reduce result must be a View" );
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Threads reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, MDRangePolicy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_mdr_policy( arg_policy )
, m_policy( Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1) )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
};
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and TeamPolicy */

View File

@ -0,0 +1,115 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
#define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Threads
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Threads,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
typedef ParallelFor<FunctorType,
Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
Kokkos::Threads> Self ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
inline void exec_one_thread() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
static inline void thread_main( ThreadsExec&, const void* arg ) {
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
public:
inline
void execute()
{
ThreadsExec::start( & Self::thread_main, this );
ThreadsExec::fence();
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_THREADS_WORKGRAPHPOLICY_HPP */

View File

@ -141,7 +141,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7]
// New Loop Macros...
// parallel_for, non-tagged
#define APPLY( func, ... ) \
@ -1010,8 +1009,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
// end tagged macros
// Structs for calling loops
template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void >
struct Tile_Loop_Type;
@ -1279,6 +1276,19 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
template <typename T>
using is_void = std::is_same< T , void >;
template <typename T>
struct is_type_array : std::false_type
{
using value_type = T;
};
template <typename T>
struct is_type_array< T[] > : std::true_type
{
using value_type = T;
};
template < typename RP
, typename Functor
, typename Tag = void
@ -1761,18 +1771,17 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
RP const& m_rp;
Functor const& m_func;
typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
// value_type & m_v;
};
// ValueType: For reductions
// For ParallelReduce
// ValueType - scalar: For reductions
template < typename RP
, typename Functor
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2251,12 +2260,497 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
};
// For ParallelReduce
// Extra specialization for array reductions
// ValueType[]: For array reductions
template < typename RP
, typename Functor
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
using value_type = typename is_type_array<ValueType>::value_type; // strip away the 'array-ness' [], only underlying type remains
inline
HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treated as a pointer for compatibility since its size is neither known nor needed here
: m_rp(rp) //Cuda 7.0 does not like braces...
, m_func(func)
, m_v(v) // use with non-void ValueType struct
{}
inline
bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
bool is_full_tile = true;
for ( int i = 0; i < RP::rank; ++i ) {
if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
partial_tile[i] = m_rp.m_tile[i] ;
}
else {
is_full_tile = false ;
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
}
}
return is_full_tile ;
} // end check bounds
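check_iteration_bounds() clamps the last tile in each dimension to whatever remains of the range. A worked one-dimensional sketch of that clamping, with illustrative bounds (range [0,10), tile size 4):

#include <cassert>

// Mirrors the per-dimension logic of check_iteration_bounds() above.
int partial_extent( int lower, int upper, int tile, int offset )
{
  if ( offset + tile <= upper )  return tile;            // full tile fits
  if ( upper - 1 - offset == 0 ) return 1;               // exactly one element remains
  if ( upper - tile > 0 )        return upper - offset;  // clamp to the end of the range
  return upper - lower;                                  // a single tile encloses the range
}

int main()
{
  // Range [0,10) tiled by 4: offsets 0 and 4 are full tiles, offset 8 is partial.
  assert( partial_extent( 0, 10, 4, 0 ) == 4 );
  assert( partial_extent( 0, 10, 4, 8 ) == 2 );
  return 0;
}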
template <int Rank>
struct RankTag
{
typedef RankTag type;
enum { value = (int)Rank };
};
#if KOKKOS_ENABLE_NEW_LOOP_MACROS
template <typename IType>
inline
void
operator()(IType tile_idx) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
}
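The operator() above recovers per-dimension tile coordinates from the flat tile index by repeated modulo/division, fastest-varying dimension first for Left ordering; each offset is then coordinate * tile size + lower bound. A small worked sketch with illustrative tile counts:

#include <array>
#include <cassert>

int main()
{
  const std::array<int,3> tile_end = { 4, 3, 2 };   // number of tiles per dimension
  int tile_idx = 17;                                // flat index, 0 <= tile_idx < 4*3*2

  std::array<int,3> coord;
  for ( int i = 0; i < 3; ++i ) {                   // Left order: dimension 0 varies fastest
    coord[i] = tile_idx % tile_end[i];
    tile_idx /= tile_end[i];
  }

  // 17 = 1 + 4*( 1 + 3*1 )  ->  tile coordinates (1, 1, 1)
  assert( coord[0] == 1 && coord[1] == 1 && coord[2] == 1 );
  return 0;
}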
#else
template <typename IType>
inline
void
operator()(IType tile_idx) const
{ operator_impl( tile_idx , RankTag<RP::rank>() ); }
// added due to a compiler error when using SFINAE to choose the operator based on rank
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<2> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_2L(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
} else {
// #pragma simd
LOOP_2L(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_2R(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
} else {
// #pragma simd
LOOP_2R(index_type, m_tiledims) {
apply( LOOP_ARGS_2 );
}
}
} // end RP::Right
} //end op() rank == 2
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<3> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_3L(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
} else {
// #pragma simd
LOOP_3L(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_3R(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
} else {
// #pragma simd
LOOP_3R(index_type, m_tiledims) {
apply( LOOP_ARGS_3 );
}
}
} // end RP::Right
} //end op() rank == 3
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<4> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_4L(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
} else {
// #pragma simd
LOOP_4L(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_4R(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
} else {
// #pragma simd
LOOP_4R(index_type, m_tiledims) {
apply( LOOP_ARGS_4 );
}
}
} // end RP::Right
} //end op() rank == 4
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<5> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_5L(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
} else {
// #pragma simd
LOOP_5L(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_5R(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
} else {
// #pragma simd
LOOP_5R(index_type, m_tiledims) {
apply( LOOP_ARGS_5 );
}
}
} // end RP::Right
} //end op() rank == 5
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<6> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_6L(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
} else {
// #pragma simd
LOOP_6L(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_6R(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
} else {
// #pragma simd
LOOP_6R(index_type, m_tiledims) {
apply( LOOP_ARGS_6 );
}
}
} // end RP::Right
} //end op() rank == 6
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<7> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_7L(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
} else {
// #pragma simd
LOOP_7L(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_7R(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
} else {
// #pragma simd
LOOP_7R(index_type, m_tiledims) {
apply( LOOP_ARGS_7 );
}
}
} // end RP::Right
} //end op() rank == 7
template <typename IType>
inline
void operator_impl( IType tile_idx , const RankTag<8> ) const
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
else {
for (int i=RP::rank-1; i>=0; --i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
// #pragma simd
LOOP_8L(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
} else {
// #pragma simd
LOOP_8L(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
}
} // end RP::Left
else {
if ( full_tile ) {
// #pragma simd
LOOP_8R(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
} else {
// #pragma simd
LOOP_8R(index_type, m_tiledims) {
apply( LOOP_ARGS_8 );
}
}
} // end RP::Right
} //end op() rank == 8
#endif
template <typename... Args>
typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
apply(Args &&... args) const
{
m_func(args... , m_v);
}
template <typename... Args>
typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
apply(Args &&... args) const
{
m_func( m_tag, args... , m_v);
}
RP const& m_rp;
Functor const& m_func;
value_type * m_v;
typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
};
// ------------------------------------------------------------------ //
// MDFunctor - wraps the range_policy and functor to pass to IterateTile
// Serial, Threads, OpenMP
// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP
// Cuda uses DeviceIterateTile directly within md_parallel_for
// ParallelReduce
// TODO Once md_parallel_{for,reduce} removed, this can be removed
// ParallelReduce - scalar reductions
template < typename MDRange, typename Functor, typename ValueType = void >
struct MDFunctor
{
@ -2273,7 +2767,7 @@ struct MDFunctor
inline
MDFunctor( MDRange const& range, Functor const& f, ValueType & v )
MDFunctor( MDRange const& range, Functor const& f )
: m_range( range )
, m_func( f )
{}
@ -2290,7 +2784,6 @@ struct MDFunctor
inline
MDFunctor& operator=( MDFunctor && ) = default;
// KOKKOS_FORCEINLINE_FUNCTION //Caused cuda warning - __host__ warning
inline
void operator()(index_type t, value_type & v) const
{
@ -2301,6 +2794,56 @@ struct MDFunctor
Functor m_func;
};
// ParallelReduce - array reductions
template < typename MDRange, typename Functor, typename ValueType >
struct MDFunctor< MDRange, Functor, ValueType[] >
{
using range_policy = MDRange;
using functor_type = Functor;
using value_type = ValueType[];
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
>;
inline
MDFunctor( MDRange const& range, Functor const& f )
: m_range( range )
, m_func( f )
, value_count( f.value_count )
{}
inline
MDFunctor( MDFunctor const& ) = default;
inline
MDFunctor& operator=( MDFunctor const& ) = default;
inline
MDFunctor( MDFunctor && ) = default;
inline
MDFunctor& operator=( MDFunctor && ) = default;
// FIXME Init and Join, as defined in m_func, are not working through the MDFunctor
// Best path forward is to eliminate the need for MDFunctor and use MDRangePolicy directly within Parallel{For,Reduce}
inline
void operator()(index_type t, value_type v) const
{
iterate_type(m_range, m_func, v)(t);
}
MDRange m_range;
Functor m_func;
size_t value_count;
};
// ParallelFor
template < typename MDRange, typename Functor >
struct MDFunctor< MDRange, Functor, void >
@ -2349,4 +2892,3 @@ struct MDFunctor< MDRange, Functor, void >
} } } //end namespace Kokkos::Experimental::Impl
#endif

View File

@ -55,16 +55,19 @@ template < typename ExecutionSpace = void
, typename WorkTag = void
, typename IndexType = void
, typename IterationPattern = void
, typename LaunchBounds = void
>
struct PolicyTraitsBase
{
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType,
IterationPattern, LaunchBounds>;
using execution_space = ExecutionSpace;
using schedule_type = Schedule;
using work_tag = WorkTag;
using index_type = IndexType;
using iteration_pattern = IterationPattern;
using launch_bounds = LaunchBounds;
};
@ -78,6 +81,7 @@ struct SetExecutionSpace
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -91,6 +95,7 @@ struct SetSchedule
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -104,6 +109,7 @@ struct SetWorkTag
, WorkTag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -117,6 +123,7 @@ struct SetIndexType
, typename PolicyBase::work_tag
, IndexType
, typename PolicyBase::iteration_pattern
, typename PolicyBase::launch_bounds
>;
};
@ -131,6 +138,22 @@ struct SetIterationPattern
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, IterationPattern
, typename PolicyBase::launch_bounds
>;
};
template <typename PolicyBase, typename LaunchBounds>
struct SetLaunchBounds
{
static_assert( is_void<typename PolicyBase::launch_bounds>::value
, "Kokkos Error: More than one launch_bounds given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
, LaunchBounds
>;
};
@ -146,8 +169,9 @@ struct AnalyzePolicy<Base, T, Traits...> : public
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
, typename std::conditional< is_launch_bounds<T>::value , SetLaunchBounds<Base,T>
, SetWorkTag<Base,T>
>::type >::type >::type >::type>::type::type
>::type >::type >::type >::type >::type>::type::type
, Traits...
>
{};
@ -178,11 +202,18 @@ struct AnalyzePolicy<Base>
, void // TODO set default iteration pattern
, typename Base::iteration_pattern
>::type;
using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value
, LaunchBounds<>
, typename Base::launch_bounds
>::type;
using type = PolicyTraitsBase< execution_space
, schedule_type
, work_tag
, index_type
, iteration_pattern
, launch_bounds
>;
};
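The launch_bounds trait threaded through PolicyTraitsBase above is what lets an execution policy carry a Kokkos::LaunchBounds<maxThreadsPerBlock, minBlocksPerSM> hint to the CUDA kernel launch. A hedged usage sketch; the bound values 256/4, the Views, and the kernel body are illustrative:

#include <Kokkos_Core.hpp>

void saxpy_with_bounds( Kokkos::View<double*> y, Kokkos::View<const double*> x, double a )
{
  // LaunchBounds<256,4> requests __launch_bounds__(256,4) for the generated CUDA kernel;
  // host backends simply carry the trait along and ignore it.
  typedef Kokkos::RangePolicy< Kokkos::LaunchBounds<256,4> > policy_t;

  Kokkos::parallel_for( policy_t( 0, y.dimension_0() ),
    KOKKOS_LAMBDA( const int i ) { y(i) += a * x(i); } );
}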

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
@ -126,11 +130,21 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
inline
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{ return __sync_val_compare_and_swap(dest,compare,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_val_compare_and_swap(dest,compare,val);
}
inline
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_val_compare_and_swap(dest,compare,val);
}
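Each hunk above adds the same guarded idiom: an optional _mm_prefetch with _MM_HINT_ET0 (a prefetch-for-write, i.e. read-for-ownership hint) on the destination just before the atomic read-modify-write, so the cache line arrives already in a writable state. A minimal sketch of the idiom; the wrapper name and counter are illustrative:

#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif

inline int fetch_add_one( volatile int * const counter )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  // Hint the line into an exclusive (writable) state before the locked RMW.
  _mm_prefetch( (const char*) counter, _MM_HINT_ET0 );
#endif
  return __sync_fetch_and_add( counter, 1 );
}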
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
@ -159,6 +173,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
return tmp.t ;
}
@ -175,6 +193,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
return tmp.t ;
}
@ -193,6 +215,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
return tmp.t ;
}
@ -209,6 +235,10 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
#endif
, const T >::type& val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
if( return_val == compare ) {

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
#define KOKKOS_ATOMIC_DECREMENT_HPP
@ -54,6 +58,10 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<char>(volatile char* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decb %0"
: /* no output registers */
@ -69,6 +77,10 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<short>(volatile short* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decw %0"
: /* no output registers */
@ -84,6 +96,10 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<int>(volatile int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decl %0"
: /* no output registers */
@ -99,6 +115,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<long long int>(volatile long long int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock decq %0"
: /* no output registers */

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
#define KOKKOS_ATOMIC_EXCHANGE_HPP
@ -81,6 +85,10 @@ T atomic_exchange(
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
// int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
return *((T*)&tmp);
}
@ -93,6 +101,11 @@ T atomic_exchange(
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
// type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
return *((T*)&tmp);
@ -108,6 +121,10 @@ T atomic_exchange( volatile T * const dest ,
{
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
int done = 0;
unsigned int active = __ballot(1);
unsigned int done_active = 0;
@ -173,6 +190,9 @@ T atomic_exchange( volatile T * const dest ,
, const T & >::type val )
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
const type v = *((type*)&val); // Extract to be sure the value doesn't change
@ -201,6 +221,10 @@ T atomic_exchange( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
union U {
Impl::cas128_t i ;
T t ;
@ -260,6 +284,10 @@ void atomic_assign( volatile T * const dest ,
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
const type v = *((type*)&val); // Extract to be sure the value doesn't change
type assumed ;
@ -285,6 +313,10 @@ void atomic_assign( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
union U {
Impl::cas128_t i ;
T t ;

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
@ -161,36 +165,60 @@ T atomic_fetch_add( volatile T * const dest ,
inline
int atomic_fetch_add( volatile int * dest , const int val )
{
int original = val;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock xadd %1, %0"
: "+m" (*dest), "+r" (original)
: "m" (*dest), "r" (original)
: "memory"
int original = val;
__asm__ __volatile__(
"lock xadd %1, %0"
: "+m" (*dest), "+r" (original)
: "m" (*dest), "r" (original)
: "memory"
);
return original;
return original;
}
#else
inline
int atomic_fetch_add( volatile int * const dest , const int val )
{ return __sync_fetch_and_add(dest, val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest, val);
}
#endif
inline
long int atomic_fetch_add( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_add(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_add(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest,val);
}
inline
unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_add(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_add(dest,val);
}
#endif
@ -205,6 +233,10 @@ T atomic_fetch_add( volatile T * const dest ,
inline U() {};
} assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {
@ -228,6 +260,10 @@ T atomic_fetch_add( volatile T * const dest ,
inline U() {};
} assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {
@ -253,6 +289,10 @@ T atomic_fetch_add( volatile T * const dest ,
inline U() {};
} assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
#define KOKKOS_ATOMIC_FETCH_AND_HPP
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_and( volatile unsigned long long int * const
inline
int atomic_fetch_and( volatile int * const dest , const int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
inline
long int atomic_fetch_and( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
inline
unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_and(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_and(dest,val);
}
#endif

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
#define KOKKOS_ATOMIC_FETCH_OR_HPP
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_or( volatile unsigned long long int * const
inline
int atomic_fetch_or( volatile int * const dest , const int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
inline
long int atomic_fetch_or( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
inline
unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_or(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_or(dest,val);
}
#endif

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
@ -136,21 +140,41 @@ T atomic_fetch_sub( volatile T * const dest ,
inline
int atomic_fetch_sub( volatile int * const dest , const int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
inline
long int atomic_fetch_sub( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
inline
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
inline
unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_sub(dest,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
return __sync_fetch_and_sub(dest,val);
}
#endif
@ -161,6 +185,10 @@ T atomic_fetch_sub( volatile T * const dest ,
{
union { int i ; T t ; } assume , oldval , newval ;
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
oldval.t = *dest ;
do {
@ -178,6 +206,10 @@ T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T >::type val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
union { long i ; T t ; } assume , oldval , newval ;
oldval.t = *dest ;
@ -202,6 +234,10 @@ T atomic_fetch_sub( volatile T * const dest ,
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
*dest = return_val - val;

View File

@ -41,6 +41,10 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
#include <xmmintrin.h>
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
#define KOKKOS_ATOMIC_INCREMENT_HPP
@ -52,6 +56,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<char>(volatile char* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incb %0"
: /* no output registers */
@ -67,6 +74,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<short>(volatile short* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incw %0"
: /* no output registers */
@ -82,6 +92,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<int>(volatile int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incl %0"
: /* no output registers */
@ -97,6 +110,9 @@ template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<long long int>(volatile long long int* a) {
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
#endif
__asm__ __volatile__(
"lock incq %0"
: /* no output registers */

View File

@ -87,17 +87,12 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#if defined( KOKKOS_ENABLE_OPENMP )
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
if(num_threads>0) {
if(use_numa>0) {
Kokkos::OpenMP::initialize(num_threads,use_numa);
}
else {
Kokkos::OpenMP::initialize(num_threads);
}
} else {
Kokkos::OpenMP::initialize();
if(use_numa>0) {
Kokkos::OpenMP::initialize(num_threads,use_numa);
}
else {
Kokkos::OpenMP::initialize(num_threads);
}
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
}
else {
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
@ -437,10 +432,7 @@ void initialize(int& narg, char* arg[])
iarg++;
}
InitArguments arguments;
arguments.num_threads = num_threads;
arguments.num_numa = numa;
arguments.device_id = device;
InitArguments arguments{num_threads, numa, device};
Impl::initialize_internal(arguments);
}
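The hunks above only change how the parsed arguments reach initialize_internal; the user-facing entry point is unchanged. A minimal sketch of driving that path from a program's main(), assuming the usual --kokkos-threads / --kokkos-numa / --kokkos-device command-line flags:

#include <Kokkos_Core.hpp>

int main( int argc, char * argv[] )
{
  Kokkos::initialize( argc, argv );   // consumes --kokkos-threads=, --kokkos-numa=, --kokkos-device=
  {
    // ... parallel work goes here, while Kokkos is live ...
  }
  Kokkos::finalize();
  return 0;
}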

View File

@ -170,28 +170,31 @@ struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType:
static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) ,
"Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" );
/* this cast to bool is needed for correctness by NVCC */
enum : bool { IsArray = static_cast<bool>(Impl::is_array< typename FunctorType::value_type >::value) };
// If not an array, the static size is sizeof(value_type); for an array it is 0
enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) };
enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) };
typedef value_type * pointer_type ;
// The reference_type for an array is 'value_type *'
// The reference_type for a single value is 'value_type &'
typedef typename Impl::if_c< ! StaticValueSize , value_type *
, value_type & >::type reference_type ;
typedef typename Impl::if_c< IsArray , value_type *
, value_type & >::type reference_type ;
// Number of values if single value
template< class F >
KOKKOS_FORCEINLINE_FUNCTION static
typename Impl::enable_if< std::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! IsArray , unsigned >::type
value_count( const F & ) { return 1 ; }
// Number of values if an array, protect via templating because 'f.value_count'
// will only exist when the functor declares the value_type to be an array.
template< class F >
KOKKOS_FORCEINLINE_FUNCTION static
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type
typename Impl::enable_if< std::is_same<F,FunctorType>::value && IsArray , unsigned >::type
value_count( const F & f ) { return f.value_count ; }
// Total size of the value
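The IsArray / value_count split above is what lets a reduction functor declare its value_type as an array and report the element count at run time. A hedged sketch of such a functor; the names and the parallel_reduce call are illustrative, not from this patch:

#include <Kokkos_Core.hpp>

struct ColumnSums {
  typedef double value_type[] ;        // array value_type => IsArray == true
  const Kokkos::View<const double**> A ;
  const unsigned value_count ;         // queried through FunctorValueTraits

  ColumnSums( const Kokkos::View<const double**> & arg_A )
    : A( arg_A ), value_count( (unsigned) arg_A.dimension_1() ) {}

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double sums[] ) const
    { for ( unsigned j = 0 ; j < value_count ; ++j ) sums[j] += A(i,j); }
};

// double result[ncol]; Kokkos::parallel_reduce( nrow , ColumnSums(A) , result );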

View File

@ -70,62 +70,6 @@
#ifdef KOKKOS_ENABLE_HBWSPACE
#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
namespace Kokkos {
namespace Experimental {
namespace {
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
typedef int (* QuerySpaceInParallelPtr )();
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
int s_in_parallel_query_count = 0 ;
} // namespace <empty>
void HBWSpace::register_in_parallel( int (*device_in_parallel)() )
{
if ( 0 == device_in_parallel ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel ERROR : given NULL" ) );
}
int i = -1 ;
if ( ! (device_in_parallel)() ) {
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
}
if ( i < s_in_parallel_query_count ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : called in_parallel" ) );
}
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
}
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
if ( i == s_in_parallel_query_count ) {
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
}
}
int HBWSpace::in_parallel()
{
const int n = s_in_parallel_query_count ;
int i = 0 ;
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
return i < n ;
}
} // namespace Experimental

} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {

View File

@ -106,62 +106,6 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace {
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
typedef int (* QuerySpaceInParallelPtr )();
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
int s_in_parallel_query_count = 0 ;
} // namespace <empty>
void HostSpace::register_in_parallel( int (*device_in_parallel)() )
{
if ( 0 == device_in_parallel ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
}
int i = -1 ;
if ( ! (device_in_parallel)() ) {
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
}
if ( i < s_in_parallel_query_count ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
}
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
}
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
if ( i == s_in_parallel_query_count ) {
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
}
}
int HostSpace::in_parallel()
{
const int n = s_in_parallel_query_count ;
int i = 0 ;
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
return i < n ;
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/* Default allocation mechanism */
@ -340,9 +284,6 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_
}
}
constexpr const char* HostSpace::name() {
return m_name;
}
} // namespace Kokkos
//----------------------------------------------------------------------------

View File

@ -45,7 +45,7 @@
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_Spinwait.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -58,9 +58,11 @@ void HostThreadTeamData::organize_pool
{
bool ok = true ;
memory_fence();
// Verify not already a member of a pool:
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
ok = ( nullptr != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
}
if ( ok ) {
@ -89,7 +91,6 @@ void HostThreadTeamData::organize_pool
mem->m_team_alloc = 1 ;
mem->m_league_rank = rank ;
mem->m_league_size = size ;
mem->m_pool_rendezvous_step = 0 ;
mem->m_team_rendezvous_step = 0 ;
pool[ rank ] = mem ;
}
@ -116,7 +117,6 @@ void HostThreadTeamData::disband_pool()
m_team_alloc = 1 ;
m_league_rank = 0 ;
m_league_size = 1 ;
m_pool_rendezvous_step = 0 ;
m_team_rendezvous_step = 0 ;
}
@ -256,11 +256,6 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
union {
int64_t full ;
int8_t byte[8] ;
} value ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
@ -275,13 +270,14 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
value.full = 0 ;
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
int64_t value = 0 ;
store_fence(); // This should not be needed but fixes #742
for ( int i = 0 ; i < end ; ++i ) {
((int8_t*) & value )[i] = int8_t( step );
}
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value.full );
, value );
}
{
@ -316,10 +312,12 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
const int end = size_byte < size ? 8 : size ;
value.full = 0 ;
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
int64_t value = 0 ;
for ( int i = 1 ; i < end ; ++i ) {
((int8_t *) & value)[i] = int8_t( step );
}
spinwait_until_equal( buffer[ sync_offset ], value.full );
spinwait_until_equal( buffer[ sync_offset ], value );
}
return rank ? 0 : 1 ;

View File

@ -50,6 +50,7 @@
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_Rendezvous.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -67,14 +68,12 @@ public:
// Assume upper bounds on number of threads:
// pool size <= 1024 threads
// pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 2052
// team size <= 64 threads
// team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36
enum : int { max_pool_members = 1024 };
enum : int { max_team_members = 64 };
enum : int { max_pool_rendezvous = ( max_pool_members / 8 ) * 4 + 4 };
enum : int { max_team_rendezvous = ( max_team_members / 8 ) * 4 + 4 };
enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) };
enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) };
private:
@ -114,7 +113,6 @@ private:
int m_league_size ;
int m_work_chunk ;
int m_steal_rank ; // work stealing rank
int mutable m_pool_rendezvous_step ;
int mutable m_team_rendezvous_step ;
HostThreadTeamData * team_member( int r ) const noexcept
@ -147,6 +145,7 @@ public:
int team_rendezvous( int const root ) const noexcept
{
return 1 == m_team_size ? 1 :
HostThreadTeamData::
rendezvous( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step
, m_team_size
@ -157,6 +156,7 @@ public:
int team_rendezvous() const noexcept
{
return 1 == m_team_size ? 1 :
HostThreadTeamData::
rendezvous( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step
, m_team_size
@ -167,6 +167,7 @@ public:
void team_rendezvous_release() const noexcept
{
if ( 1 < m_team_size ) {
HostThreadTeamData::
rendezvous_release( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step );
}
@ -175,19 +176,30 @@ public:
inline
int pool_rendezvous() const noexcept
{
static constexpr int yield_wait =
#if defined( KOKKOS_COMPILER_IBM )
// If running on IBM POWER architecture the global
// level rendezvous should immediately yield when
// waiting for other threads in the pool to arrive.
1
#else
0
#endif
;
return 1 == m_pool_size ? 1 :
Kokkos::Impl::
rendezvous( m_pool_scratch + m_pool_rendezvous
, m_pool_rendezvous_step
, m_pool_size
, m_pool_rank );
, m_pool_rank
, yield_wait );
}
inline
void pool_rendezvous_release() const noexcept
{
if ( 1 < m_pool_size ) {
rendezvous_release( m_pool_scratch + m_pool_rendezvous
, m_pool_rendezvous_step );
Kokkos::Impl::
rendezvous_release( m_pool_scratch + m_pool_rendezvous );
}
}
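With the pool rendezvous now routed through the free function (and optionally yielding while waiting, hardwired on for IBM POWER), the calling convention stays the familiar one-thread-wins bracket. A hedged sketch using this internal API; do_serial_work() is a hypothetical placeholder:

void do_serial_work();   // hypothetical serial section

// Executed by every thread of the pool with its own HostThreadTeamData 'data':
inline void pool_barrier_with_work( Kokkos::Impl::HostThreadTeamData & data )
{
  if ( data.pool_rendezvous() ) {
    // Only one thread reaches this branch; the rest wait inside pool_rendezvous().
    do_serial_work();
    data.pool_rendezvous_release();    // releases the waiting threads
  }
}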
@ -213,7 +225,6 @@ public:
, m_league_size(1)
, m_work_chunk(0)
, m_steal_rank(0)
, m_pool_rendezvous_step(0)
, m_team_rendezvous_step(0)
{}
@ -406,7 +417,7 @@ fflush(stdout);
// Steal from next team, round robin
// The next team is offset by m_team_alloc if it fits in the pool.
m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ?
m_team_base + m_team_alloc : 0 ;
}

View File

@ -50,51 +50,70 @@
namespace Kokkos {
namespace Profiling {
static initFunction initProfileLibrary = nullptr;
static finalizeFunction finalizeProfileLibrary = nullptr;
static beginFunction beginForCallee = nullptr;
static beginFunction beginScanCallee = nullptr;
static beginFunction beginReduceCallee = nullptr;
static endFunction endForCallee = nullptr;
static endFunction endScanCallee = nullptr;
static endFunction endReduceCallee = nullptr;
static pushFunction pushRegionCallee = nullptr;
static popFunction popRegionCallee = nullptr;
static allocateDataFunction allocateDataCallee = nullptr;
static deallocateDataFunction deallocateDataCallee = nullptr;
static beginDeepCopyFunction beginDeepCopyCallee = nullptr;
static endDeepCopyFunction endDeepCopyCallee = nullptr;
SpaceHandle::SpaceHandle(const char* space_name) {
strncpy(name,space_name,64);
}
bool profileLibraryLoaded() {
return (NULL != initProfileLibrary);
return (nullptr != initProfileLibrary);
}
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginForCallee) {
if(nullptr != beginForCallee) {
Kokkos::fence();
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelFor(const uint64_t kernelID) {
if(NULL != endForCallee) {
if(nullptr != endForCallee) {
Kokkos::fence();
(*endForCallee)(kernelID);
}
}
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginScanCallee) {
if(nullptr != beginScanCallee) {
Kokkos::fence();
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelScan(const uint64_t kernelID) {
if(NULL != endScanCallee) {
if(nullptr != endScanCallee) {
Kokkos::fence();
(*endScanCallee)(kernelID);
}
}
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginReduceCallee) {
if(nullptr != beginReduceCallee) {
Kokkos::fence();
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelReduce(const uint64_t kernelID) {
if(NULL != endReduceCallee) {
if(nullptr != endReduceCallee) {
Kokkos::fence();
(*endReduceCallee)(kernelID);
}
@ -102,31 +121,47 @@ void endParallelReduce(const uint64_t kernelID) {
void pushRegion(const std::string& kName) {
if( NULL != pushRegionCallee ) {
if( nullptr != pushRegionCallee ) {
Kokkos::fence();
(*pushRegionCallee)(kName.c_str());
}
}
void popRegion() {
if( NULL != popRegionCallee ) {
if( nullptr != popRegionCallee ) {
Kokkos::fence();
(*popRegionCallee)();
}
}
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
if(NULL != allocateDataCallee) {
if(nullptr != allocateDataCallee) {
(*allocateDataCallee)(space,label.c_str(),ptr,size);
}
}
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
if(NULL != allocateDataCallee) {
if(nullptr != deallocateDataCallee) {
(*deallocateDataCallee)(space,label.c_str(),ptr,size);
}
}
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
const uint64_t size) {
if(nullptr != beginDeepCopyCallee) {
(*beginDeepCopyCallee)(dst_space, dst_label.c_str(), dst_ptr,
src_space, src_label.c_str(), src_ptr,
size);
}
}
void endDeepCopy() {
if(nullptr != endDeepCopyCallee) {
(*endDeepCopyCallee)();
}
}
void initialize() {
// Make sure initialize calls happens only once
@ -140,7 +175,7 @@ void initialize() {
// If we do not find a profiling library in the environment then exit
// early.
if( NULL == envProfileLibrary ) {
if( nullptr == envProfileLibrary ) {
return ;
}
@ -149,10 +184,10 @@ void initialize() {
char* profileLibraryName = strtok(envProfileCopy, ";");
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
if( (nullptr != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
if(NULL == firstProfileLibrary) {
if(nullptr == firstProfileLibrary) {
std::cerr << "Error: Unable to load KokkosP library: " <<
profileLibraryName << std::endl;
} else {
@ -191,14 +226,19 @@ void initialize() {
auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
deallocateDataCallee = *((deallocateDataFunction*) &p12);
auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy");
beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13);
auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy");
endDeepCopyCallee = *((endDeepCopyFunction*) &p14);
}
}
if(NULL != initProfileLibrary) {
if(nullptr != initProfileLibrary) {
(*initProfileLibrary)(0,
(uint64_t) KOKKOSP_INTERFACE_VERSION,
(uint32_t) 0,
NULL);
nullptr);
}
free(envProfileCopy);
@ -210,28 +250,30 @@ void finalize() {
if(is_finalized) return;
is_finalized = 1;
if(NULL != finalizeProfileLibrary) {
if(nullptr != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
// Set all profile hooks to NULL to prevent
// Set all profile hooks to nullptr to prevent
// any additional calls. Once we are told to
// finalize, we mean it
initProfileLibrary = NULL;
finalizeProfileLibrary = NULL;
initProfileLibrary = nullptr;
finalizeProfileLibrary = nullptr;
beginForCallee = NULL;
beginScanCallee = NULL;
beginReduceCallee = NULL;
endScanCallee = NULL;
endForCallee = NULL;
endReduceCallee = NULL;
beginForCallee = nullptr;
beginScanCallee = nullptr;
beginReduceCallee = nullptr;
endScanCallee = nullptr;
endForCallee = nullptr;
endReduceCallee = nullptr;
pushRegionCallee = NULL;
popRegionCallee = NULL;
pushRegionCallee = nullptr;
popRegionCallee = nullptr;
allocateDataCallee = NULL;
deallocateDataCallee = NULL;
allocateDataCallee = nullptr;
deallocateDataCallee = nullptr;
beginDeepCopyCallee = nullptr;
endDeepCopyCallee = nullptr;
}
}
}
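The two new dlsym lookups mean a KokkosP tool can now observe deep copies by exporting kokkosp_begin_deep_copy and kokkosp_end_deep_copy. A minimal, hedged tool sketch, built as a shared library and selected through the profiling-library environment variable; the SpaceHandle layout is assumed to mirror the char[64] name shown above:

// deep_copy_tool.cpp -- sketch of a KokkosP tool implementing only the new hooks.
#include <cstdint>
#include <cstdio>

struct SpaceHandle { char name[64] ; };   // assumed to match Kokkos::Profiling::SpaceHandle

extern "C" void kokkosp_begin_deep_copy( SpaceHandle dst_space , const char * dst_label , const void * dst_ptr
                                       , SpaceHandle src_space , const char * src_label , const void * src_ptr
                                       , uint64_t size )
{
  printf( "deep_copy %s:%s -> %s:%s : %llu bytes\n"
        , src_space.name , src_label , dst_space.name , dst_label
        , (unsigned long long) size );
  (void) dst_ptr ; (void) src_ptr ;
}

extern "C" void kokkosp_end_deep_copy() {}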

View File

@ -81,23 +81,11 @@ typedef void (*popFunction)();
typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
static initFunction initProfileLibrary = NULL;
static finalizeFunction finalizeProfileLibrary = NULL;
static beginFunction beginForCallee = NULL;
static beginFunction beginScanCallee = NULL;
static beginFunction beginReduceCallee = NULL;
static endFunction endForCallee = NULL;
static endFunction endScanCallee = NULL;
static endFunction endReduceCallee = NULL;
static pushFunction pushRegionCallee = NULL;
static popFunction popRegionCallee = NULL;
static allocateDataFunction allocateDataCallee = NULL;
static deallocateDataFunction deallocateDataCallee = NULL;
typedef void (*beginDeepCopyFunction)(
SpaceHandle, const char*, const void*,
SpaceHandle, const char*, const void*,
uint64_t);
typedef void (*endDeepCopyFunction)();
bool profileLibraryLoaded();
@ -114,35 +102,14 @@ void popRegion();
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
const uint64_t size);
void endDeepCopy();
void initialize();
void finalize();
//Define finalize_fake inline to get rid of warnings for unused static variables
inline void finalize_fake() {
if(NULL != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
// Set all profile hooks to NULL to prevent
// any additional calls. Once we are told to
// finalize, we mean it
beginForCallee = NULL;
beginScanCallee = NULL;
beginReduceCallee = NULL;
endScanCallee = NULL;
endForCallee = NULL;
endReduceCallee = NULL;
allocateDataCallee = NULL;
deallocateDataCallee = NULL;
initProfileLibrary = NULL;
finalizeProfileLibrary = NULL;
pushRegionCallee = NULL;
popRegionCallee = NULL;
}
}
}
}

View File

@ -0,0 +1,208 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Rendezvous.hpp>
#include <impl/Kokkos_Spinwait.hpp>
namespace Kokkos { namespace Impl {
//----------------------------------------------------------------------------
/* pattern for rendezvous
*
* if ( rendezvous() ) {
* ... all other threads are still in team_rendezvous() ...
* rendezvous_release();
* ... all other threads are released from team_rendezvous() ...
* }
*/
int rendezvous( volatile int64_t * const buffer
, int const size
, int const rank
, int const slow
) noexcept
{
enum : int { shift_byte = 3 };
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
enum : int { mask_byte = size_byte - 1 };
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Cycle step values: 1 <= step <= size_val_cycle
// An odd multiple of memory cycle so that when a memory location
// is reused it has a different value.
// Must be representable within a single byte: size_val_cycle < 16
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called by rank = [ 0 .. size )
// buffer aligned to int64_t[4]
// A sequence of rendezvous uses four cycled locations in memory
// and non-equal cycled synchronization values to
// 1) prevent rendezvous from overtaking one another and
// 2) give each spin wait location an int64_t[4] span
// so that it has its own cache line.
const int64_t step = (buffer[0] % size_val_cycle ) + 1 ;
// The leading int64_t[4] span is for thread 0 to write
// and all other threads to read spin-wait.
// sync_offset is the index into this array for this step.
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
if ( group_begin < size ) {
// This thread waits for threads
// [ group_begin .. group_begin + 8 )
// [ rank*8 .. rank*8 + 8 )
// to write to their designated bytes.
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
int64_t value = 0;
for ( int i = 0 ; i < end ; ++i ) {
value |= step << (i * size_byte );
}
store_fence(); // This should not be needed but fixes #742
if ( slow ) {
yield_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}
else {
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}
}
{
// This thread sets its designated byte.
// ( rank % size_byte ) +
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
// ( sync_offset * size_byte )
const int offset = ( rank & mask_byte )
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
+ ( sync_offset << shift_byte );
// All of this thread's previous memory stores must be complete before
// this thread stores the step value at this thread's designated byte
// in the shared synchronization array.
Kokkos::memory_fence();
((volatile int8_t*) buffer)[ offset ] = int8_t( step );
// Memory fence to push the previous store out
Kokkos::memory_fence();
}
// Wait for thread 0 to release all other threads
if ( slow ) {
yield_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) );
}
else {
spinwait_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) );
}
}
else {
// Thread 0 waits for threads [1..7]
// to write to their designated bytes.
const int end = size_byte < size ? 8 : size ;
int64_t value = 0;
for ( int i = 1 ; i < end ; ++i ) {
value |= step << (i * size_byte );
}
if ( slow ) {
yield_until_equal( buffer[ sync_offset ], value );
}
else {
spinwait_until_equal( buffer[ sync_offset ], value );
}
}
return rank ? 0 : 1 ;
}
void rendezvous_release( volatile int64_t * const buffer ) noexcept
{
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called after team_rendezvous
// Called only by true == team_rendezvous(root)
// update step
const int64_t step = (buffer[0] % size_val_cycle ) + 1;
buffer[0] = step;
// Memory fence to be sure all previous writes are complete:
Kokkos::memory_fence();
buffer[ (step & mask_mem_cycle) + size_mem_cycle ] = step;
// Memory fence to push the store out
Kokkos::memory_fence();
}
}} // namespace Kokkos::Impl
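The waiting logic above boils down to comparing one cache-line-aligned int64_t against a value whose bytes each hold the current step for one member of an eight-thread group. A standalone, hedged illustration of that packing; the numbers are chosen only for the example:

#include <cstdint>
#include <cassert>

int main()
{
  const int64_t step = 3 ;      // 1 <= step <= 12, so it always fits in a byte
  int64_t expected = 0 ;
  for ( int i = 0 ; i < 8 ; ++i ) {
    expected |= step << ( i * 8 ) ;   // byte i becomes 'step' once thread i arrives
  }
  assert( expected == int64_t(0x0303030303030303) ) ;
  return 0 ;
}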

View File

@ -0,0 +1,87 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_RENDEZVOUS_HPP
#define KOKKOS_IMPL_RENDEZVOUS_HPP
#include <cstdint>
namespace Kokkos { namespace Impl {
inline
constexpr int rendezvous_buffer_size( int max_members ) noexcept
{
return (((max_members + 7) / 8) * 4) + 4 + 4;
}
/** \brief Thread pool rendezvous
*
* Rendezvous pattern:
* if ( rendezvous(root) ) {
* ... only root thread here while all others wait ...
* rendezvous_release();
* }
* else {
* ... all other threads release here ...
* }
*
* Requires: buffer[ rendezvous_buffer_size( max_threads ) ];
*
* When slow != 0 the expectation is thread arrival will be
* slow so the threads that arrive early should quickly yield
* their core to the runtime thus possibly allowing the late
* arriving threads to have more resources
* (e.g., power and clock frequency).
*/
int rendezvous( volatile int64_t * const buffer
, int const size
, int const rank
, int const slow = 0 ) noexcept ;
void rendezvous_release( volatile int64_t * const buffer ) noexcept ;
}} // namespace Kokkos::Impl
#endif // KOKKOS_IMPL_RENDEZVOUS_HPP
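A hedged sketch of how a host thread pool might drive these functions; the buffer is shared, zero-initialized, and sized in int64_t entries by rendezvous_buffer_size, while thread creation itself is elided:

#include <cstdint>
#include <impl/Kokkos_Rendezvous.hpp>

// Every pool thread calls this with the same 'buffer' and its own 'rank'.
inline void pool_barrier( volatile int64_t * const buffer , int const size , int const rank )
{
  if ( Kokkos::Impl::rendezvous( buffer , size , rank ) ) {
    // Exactly one thread arrives here; all others wait inside rendezvous().
    Kokkos::Impl::rendezvous_release( buffer );
  }
  // All 'size' threads have passed the barrier once they return.
}

// Owner-side sizing, done once before the threads start:
//   int64_t buffer[ Kokkos::Impl::rendezvous_buffer_size( pool_size ) ] = { 0 };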

View File

@ -50,6 +50,7 @@
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/
@ -123,7 +124,6 @@ void serial_resize_thread_team_data( size_t pool_reduce_bytes
}
}
// Get thread team data structure for omp_get_thread_num()
HostThreadTeamData * serial_get_thread_team_data()
{
return & g_serial_thread_team_data ;
@ -151,6 +151,8 @@ void Serial::initialize( unsigned threads_count
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
Impl::SharedAllocationRecord< void, void >::tracking_enable();
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if defined(KOKKOS_ENABLE_PROFILING)

View File

@ -62,7 +62,7 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
{
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
@ -122,7 +122,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
{
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

View File

@ -65,7 +65,7 @@ public:
using execution_space = Kokkos::Serial ;
using memory_space = Kokkos::HostSpace ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
static

View File

@ -0,0 +1,102 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Serial
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Serial,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
public:
inline
void execute()
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */

View File

@ -46,23 +46,23 @@
namespace Kokkos {
namespace Impl {
int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
namespace {
void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
{
// A host thread claim and disable tracking flag
__thread int t_tracking_enabled = 1;
while ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) );
}
void SharedAllocationRecord< void , void >::tracking_release_and_enable()
{
// The host thread that claimed and disabled the tracking flag
// now release and enable tracking.
int SharedAllocationRecord< void , void >::tracking_enabled()
{ return t_tracking_enabled; }
if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
}
void SharedAllocationRecord< void , void >::tracking_disable()
{
t_tracking_enabled = 0;
}
void SharedAllocationRecord< void , void >::tracking_enable()
{
t_tracking_enabled = 1;
}
//----------------------------------------------------------------------------

View File

@ -71,6 +71,9 @@ public:
KOKKOS_INLINE_FUNCTION static
const SharedAllocationHeader * get_header( void * alloc_ptr )
{ return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
KOKKOS_INLINE_FUNCTION
const char* label() const { return m_label; }
};
template<>
@ -83,8 +86,6 @@ protected:
typedef void (* function_type )( SharedAllocationRecord<void,void> * );
static int s_tracking_enabled ;
SharedAllocationHeader * const m_alloc_ptr ;
size_t const m_alloc_size ;
function_type const m_dealloc ;
@ -110,17 +111,17 @@ protected:
public:
inline std::string get_label() const { return std::string("Unmanaged"); }
static int tracking_enabled() { return s_tracking_enabled ; }
static int tracking_enabled();
/**\brief A host process thread claims and disables the
* shared allocation tracking flag.
*/
static void tracking_claim_and_disable();
static void tracking_disable();
/**\brief A host process thread releases and enables the
* shared allocation tracking flag.
*/
static void tracking_release_and_enable();
static void tracking_enable();
~SharedAllocationRecord() = default ;
@ -317,6 +318,11 @@ public:
#endif
}
KOKKOS_INLINE_FUNCTION
bool has_record() const {
return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }
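tracking_disable() and tracking_enable() now flip a thread-local flag instead of spinning on a process-wide compare-and-swap, so the bracket is cheap and affects only the calling host thread. A hedged sketch of that bracket, shown only to illustrate the new internal calls:

#include <impl/Kokkos_SharedAlloc.hpp>

inline void with_tracking_disabled()
{
  using RecordBase = Kokkos::Impl::SharedAllocationRecord< void , void > ;

  RecordBase::tracking_disable();     // this thread only; others are unaffected
  // ... allocations made here are not reference counted ...
  RecordBase::tracking_enable();
}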

View File

@ -0,0 +1,210 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_BitOps.hpp>
#if defined( KOKKOS_ENABLE_STDTHREAD )
#include <thread>
#elif !defined( _WIN32 )
#include <sched.h>
#include <time.h>
#else
#include <process.h>
#include <winsock2.h>
#include <windows.h>
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace {
void host_thread_yield( const uint32_t i , const int force_yield )
{
static constexpr uint32_t sleep_limit = 1 << 13 ;
static constexpr uint32_t yield_limit = 1 << 12 ;
const int c = Kokkos::Impl::bit_scan_reverse(i);
if ( sleep_limit < i ) {
// Attempt to put the thread to sleep for 'c' milliseconds
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) );
#elif !defined( _WIN32 )
timespec req ;
req.tv_sec = 0 ;
req.tv_nsec = 1000 * c ;
nanosleep( &req, nullptr );
#else /* defined( _WIN32 ) IS Microsoft Windows */
Sleep(c);
#endif
}
else if ( force_yield || yield_limit < i ) {
// Attempt to yield thread resources to runtime
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::yield();
#elif !defined( _WIN32 )
sched_yield();
#else /* defined( _WIN32 ) IS Microsoft Windows */
YieldProcessor();
#endif
}
#if defined( KOKKOS_ENABLE_ASM )
else if ( (1u<<4) < i ) {
// Insert a few no-ops to quiet the thread:
for ( int k = 0 ; k < c ; ++k ) {
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "nop\n" );
#else
__asm__ __volatile__( "nop\n" );
#endif
#elif defined(__PPC64__)
asm volatile( "nop\n" );
#endif
}
}
{
// Insert memory pause
#if defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ )
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
asm volatile( "pause\n":::"memory" );
#else
__asm__ __volatile__( "pause\n":::"memory" );
#endif
#elif defined(__PPC64__)
asm volatile( "or 27, 27, 27" ::: "memory" );
#endif
}
#endif /* defined( KOKKOS_ENABLE_ASM ) */
}
}}} // namespace Kokkos::Impl::{anonymous}
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void yield_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}
#endif
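From a caller's perspective the new yield_* entry points are drop-in siblings of the spinwait_* ones, trading raw latency for politeness to the runtime. A hedged sketch with an illustrative flag:

#include <cstdint>
#include <impl/Kokkos_Spinwait.hpp>

// One thread eventually stores 1 into 'ready'; the waiters back off through
// pause instructions, then sched_yield, then short sleeps (thresholds above).
inline void wait_busy( volatile int32_t & ready )
{ Kokkos::Impl::spinwait_until_equal( ready , 1 ); }

// Variant that yields immediately, for when arrival is expected to be slow:
inline void wait_polite( volatile int32_t & ready )
{ Kokkos::Impl::yield_until_equal( ready , 1 ); }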

View File

@ -59,6 +59,13 @@ void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );
void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );
#else
KOKKOS_INLINE_FUNCTION
@ -71,6 +78,16 @@ void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}
#endif
} /* namespace Impl */

View File

@ -59,24 +59,15 @@
namespace Kokkos {
namespace Impl {
/*\brief Implementation data for task data management, access, and execution.
*
* Curiously recurring template pattern (CRTP)
* to allow static_cast from the
* task root type and a task's FunctorType.
*
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
* TaskBase< Space , ResultType , void >
* : TaskBase< Space , void , void >
* { ... };
*/
template< typename Space , typename ResultType , typename FunctorType >
template< class Space , typename ResultType , class FunctorType >
class TaskBase ;
template< typename Space >
class TaskQueue ;
template< typename Space >
class TaskQueueSpecialization ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -86,8 +77,217 @@ class TaskBase ;
namespace Kokkos {
namespace Impl {
template< typename Space >
class TaskQueueSpecialization ;
/** \brief Base class for task management, access, and execution.
*
* Inheritance structure to allow static_cast from the task root type
* and a task's FunctorType.
*
* // Enable a functor to access the base class
* // and provide memory for result value.
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< void , void , void >
* , FunctorType
* { ... };
* Followed by memory allocated for result value.
*
*
* States of a task:
*
* Constructing State, NOT IN a linked list
* m_wait == 0
* m_next == 0
*
* Scheduling transition : Constructing -> Waiting
* before:
* m_wait == 0
* m_next == this task's initial dependence, 0 if none
* after:
* m_wait == EndTag
* m_next == EndTag
*
* Waiting State, IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == next of linked list of tasks
*
* transition : Waiting -> Executing
* before:
* m_next == EndTag
* after:
* m_next == LockTag
*
* Executing State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == LockTag
*
* Respawn transition : Executing -> Executing-Respawn
* before:
* m_next == LockTag
* after:
* m_next == this task's updated dependence, 0 if none
*
* Executing-Respawn State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == this task's updated dependence, 0 if none
*
* transition : Executing -> Complete
* before:
* m_wait == head of linked list
* after:
* m_wait == LockTag
*
* Complete State, NOT IN a linked list
* m_wait == LockTag: cannot add dependence (<=> complete)
* m_next == LockTag: not a member of a wait queue
*
*/
template<>
class TaskBase< void , void , void >
{
public:
enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
template< typename > friend class Kokkos::TaskScheduler ;
typedef TaskQueue< void > queue_type ;
typedef void (* function_type) ( TaskBase * , void * );
// sizeof(TaskBase) == 48
function_type m_apply ; ///< Apply function pointer
queue_type * m_queue ; ///< Pointer to queue
TaskBase * m_wait ; ///< Linked list of tasks waiting on this
TaskBase * m_next ; ///< Waiting linked-list next
int32_t m_ref_count ; ///< Reference count
int32_t m_alloc_size ; ///< Allocation size
int32_t m_dep_count ; ///< Aggregate's number of dependences
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
KOKKOS_INLINE_FUNCTION constexpr
TaskBase()
: m_apply( 0 )
, m_queue( 0 )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( 0 )
, m_alloc_size( 0 )
, m_dep_count( 0 )
, m_task_type( 0 )
, m_priority( 0 )
{}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
TaskBase * volatile * aggregate_dependences() volatile
{ return reinterpret_cast<TaskBase*volatile*>( this + 1 ); }
KOKKOS_INLINE_FUNCTION
bool requested_respawn()
{
// This should only be called when a task has finished executing and is
// in the transition to either the complete or executing-respawn state.
TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag );
return lock != m_next;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
if ( 0 != dep ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_increment( &(dep->m_ref_count) );
}
}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
int32_t reference_count() const
{ return *((int32_t volatile *)( & m_ref_count )); }
};
static_assert( sizeof(TaskBase<void,void,void>) == 48
, "Verifying expected sizeof(TaskBase<void,void,void>)" );
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< typename ResultType >
struct TaskResult {
enum : int32_t { size = sizeof(ResultType) };
using reference_type = ResultType & ;
KOKKOS_INLINE_FUNCTION static
ResultType * ptr( TaskBase<void,void,void> * task )
{
return reinterpret_cast< ResultType * >
( reinterpret_cast< char * >(task) + task->m_alloc_size - sizeof(ResultType) );
}
KOKKOS_INLINE_FUNCTION static
reference_type get( TaskBase<void,void,void> * task )
{ return *ptr( task ); }
};
template<>
struct TaskResult< void > {
enum : int32_t { size = 0 };
using reference_type = void ;
KOKKOS_INLINE_FUNCTION static
void * ptr( TaskBase<void,void,void> * ) { return (void*) 0 ; }
KOKKOS_INLINE_FUNCTION static
reference_type get( TaskBase<void,void,void> * ) {}
};
} /* namespace Impl */
} /* namespace Kokkos */
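Under the new scheme a task's result is no longer a member of a derived TaskBase; it lives at the tail of the task's allocation and TaskResult recovers it purely from m_alloc_size. A standalone, hedged illustration of that address arithmetic with made-up sizes:

#include <cassert>
#include <cstdint>

int main()
{
  // Pretend the task header plus functor occupy 64 bytes and the allocation
  // is 80 bytes; TaskResult<double>::ptr then lands 8 bytes before the end.
  const int32_t alloc_size  = 80 ;
  const int32_t result_size = (int32_t) sizeof(double) ;
  char allocation[ 80 ] = { 0 } ;

  char * const result = allocation + alloc_size - result_size ;
  assert( result == allocation + 72 ) ;
  return 0 ;
}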
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueue< void > {};
/** \brief Manage task allocation, deallocation, and scheduling.
*
@ -95,7 +295,7 @@ class TaskQueueSpecialization ;
* All other aspects of task management have shared implementation.
*/
template< typename ExecSpace >
class TaskQueue {
class TaskQueue : public TaskQueue<void> {
private:
friend class TaskQueueSpecialization< ExecSpace > ;
@ -106,7 +306,7 @@ private:
using memory_space = typename specialization::memory_space ;
using device_type = Kokkos::Device< execution_space , memory_space > ;
using memory_pool = Kokkos::MemoryPool< device_type > ;
using task_root_type = Kokkos::Impl::TaskBase<execution_space,void,void> ;
using task_root_type = Kokkos::Impl::TaskBase<void,void,void> ;
struct Destroy {
TaskQueue * m_queue ;
@ -198,12 +398,10 @@ public:
}
// Assign task pointer with reference counting of assigned tasks
template< typename LV , typename RV >
KOKKOS_FUNCTION static
void assign( TaskBase< execution_space,LV,void> ** const lhs
, TaskBase< execution_space,RV,void> * const rhs )
void assign( task_root_type ** const lhs
, task_root_type * const rhs )
{
using task_lhs = TaskBase< execution_space,LV,void> ;
#if 0
{
printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
@ -225,7 +423,7 @@ public:
// Force write of *lhs
*static_cast< task_lhs * volatile * >(lhs) = rhs ;
*static_cast< task_root_type * volatile * >(lhs) = rhs ;
Kokkos::memory_fence();
}
@ -238,6 +436,38 @@ public:
KOKKOS_FUNCTION
void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
enum : size_t { task_size = sizeof(task_type) };
enum : size_t { result_size = Impl::TaskResult< value_type >::size };
enum : size_t { alloc_size =
( ( task_size + align_mask ) & ~align_mask ) +
( ( result_size + align_mask ) & ~align_mask ) };
return m_memory.allocate_block_size( task_size );
}
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
return m_memory.allocate_block_size( sizeof(task_root_type) + narg * sizeof(task_root_type*) );
}
};
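spawn_allocation_size rounds the task-plus-functor object and the result slot up to 16 bytes each before consulting the memory pool. A worked example of the enum arithmetic under assumed sizes:

#include <cstddef>

// Assumed sizes for illustration: a 56-byte task+functor and an 8-byte result.
enum : size_t { task_size = 56 , result_size = 8 };
enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
enum : size_t { alloc_size = ( ( task_size   + align_mask ) & ~align_mask )
                           + ( ( result_size + align_mask ) & ~align_mask ) };

static_assert( alloc_size == 64 + 16 , "rounds to a 64-byte task part plus a 16-byte result part" );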
} /* namespace Impl */
@ -249,261 +479,9 @@ public:
namespace Kokkos {
namespace Impl {
template<>
class TaskBase< void , void , void > {
public:
enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
};
/** \brief Base class for task management, access, and execution.
*
* Inheritance structure to allow static_cast from the task root type
* and a task's FunctorType.
*
* // Enable a Future to access result data
* TaskBase< Space , ResultType , void >
* : TaskBase< void , void , void >
* { ... };
*
* // Enable a functor to access the base class
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
*
* States of a task:
*
* Constructing State, NOT IN a linked list
* m_wait == 0
* m_next == 0
*
* Scheduling transition : Constructing -> Waiting
* before:
* m_wait == 0
* m_next == this task's initial dependence, 0 if none
* after:
* m_wait == EndTag
* m_next == EndTag
*
* Waiting State, IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == next of linked list of tasks
*
* transition : Waiting -> Executing
* before:
* m_next == EndTag
* after::
* m_next == LockTag
*
* Executing State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == LockTag
*
* Respawn transition : Executing -> Executing-Respawn
* before:
* m_next == LockTag
* after:
* m_next == this task's updated dependence, 0 if none
*
* Executing-Respawn State, NOT IN a linked list
* m_apply != 0
* m_queue != 0
* m_ref_count > 0
* m_wait == head of linked list of tasks waiting on this task
* m_next == this task's updated dependence, 0 if none
*
* transition : Executing -> Complete
* before:
* m_wait == head of linked list
* after:
* m_wait == LockTag
*
* Complete State, NOT IN a linked list
* m_wait == LockTag: cannot add dependence
* m_next == LockTag: not a member of a wait queue
*
*/
template< typename ExecSpace >
class TaskBase< ExecSpace , void , void >
{
public:
enum : int16_t { TaskTeam = TaskBase<void,void,void>::TaskTeam
, TaskSingle = TaskBase<void,void,void>::TaskSingle
, Aggregate = TaskBase<void,void,void>::Aggregate };
enum : uintptr_t { LockTag = TaskBase<void,void,void>::LockTag
, EndTag = TaskBase<void,void,void>::EndTag };
using execution_space = ExecSpace ;
using queue_type = TaskQueue< execution_space > ;
template< typename > friend class Kokkos::TaskScheduler ;
typedef void (* function_type) ( TaskBase * , void * );
// sizeof(TaskBase) == 48
function_type m_apply ; ///< Apply function pointer
queue_type * m_queue ; ///< Queue in which this task resides
TaskBase * m_wait ; ///< Linked list of tasks waiting on this
TaskBase * m_next ; ///< Waiting linked-list next
int32_t m_ref_count ; ///< Reference count
int32_t m_alloc_size ; ///< Allocation size
int32_t m_dep_count ; ///< Aggregate's number of dependences
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for a runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, TaskBase * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
) noexcept
: m_apply( arg_apply )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( arg_dependence )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( 0 )
, m_task_type( arg_task_type )
, m_priority( arg_priority )
{}
// Constructor for an aggregate task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( queue_type * arg_queue
, int arg_ref_count
, int arg_alloc_size
, int arg_dep_count
) noexcept
: m_apply( 0 )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( arg_dep_count )
, m_task_type( Aggregate )
, m_priority( 0 )
{}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
TaskBase ** aggregate_dependences()
{ return reinterpret_cast<TaskBase**>( this + 1 ); }
KOKKOS_INLINE_FUNCTION
bool requested_respawn()
{
// This should only be called when a task has finished executing and is
// in the transition to either the complete or executing-respawn state.
TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag );
return lock != m_next;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
if ( 0 != dep ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_increment( &(dep->m_ref_count) );
}
}
using get_return_type = void ;
KOKKOS_INLINE_FUNCTION
get_return_type get() const {}
};
template < typename ExecSpace , typename ResultType >
class TaskBase< ExecSpace , ResultType , void >
: public TaskBase< ExecSpace , void , void >
{
private:
using root_type = TaskBase<ExecSpace,void,void> ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
static_assert( sizeof(root_type) == 48 , "" );
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
public:
ResultType m_result ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
)
: root_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, m_result()
{}
using get_return_type = ResultType const & ;
KOKKOS_INLINE_FUNCTION
get_return_type get() const { return m_result ; }
};
template< typename ExecSpace , typename ResultType , typename FunctorType >
template< class ExecSpace , typename ResultType , class FunctorType >
class TaskBase
: public TaskBase< ExecSpace , ResultType , void >
: public TaskBase< void , void , void >
, public FunctorType
{
private:
@ -516,50 +494,31 @@ private:
public:
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using specialization = TaskQueueSpecialization< ExecSpace > ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
using member_type = typename specialization::member_type ;
using root_type = TaskBase< void , void , void > ;
using functor_type = FunctorType ;
using result_type = ResultType ;
template< typename Type >
KOKKOS_INLINE_FUNCTION static
void apply_functor
( Type * const task
, typename std::enable_if
< std::is_same< typename Type::result_type , void >::value
, member_type * const
>::type member
)
{
using fType = typename Type::functor_type ;
static_cast<fType*>(task)->operator()( *member );
}
using specialization = TaskQueueSpecialization< ExecSpace > ;
using member_type = typename specialization::member_type ;
template< typename Type >
KOKKOS_INLINE_FUNCTION static
void apply_functor
( Type * const task
, typename std::enable_if
< ! std::is_same< typename Type::result_type , void >::value
, member_type * const
>::type member
)
{
using fType = typename Type::functor_type ;
static_cast<fType*>(task)->operator()( *member , task->m_result );
}
KOKKOS_INLINE_FUNCTION
void apply_functor( member_type * const member , void * )
{ functor_type::operator()( *member ); }
template< typename T >
KOKKOS_INLINE_FUNCTION
void apply_functor( member_type * const member
, T * const result )
{ functor_type::operator()( *member , *result ); }
KOKKOS_FUNCTION static
void apply( root_type * root , void * exec )
{
TaskBase * const task = static_cast< TaskBase * >( root );
member_type * const member = reinterpret_cast< member_type * >( exec );
result_type * const result = TaskResult< result_type >::ptr( task );
TaskBase::template apply_functor( task , member );
task->apply_functor( member , result );
// Task may be serial or team.
// If team then must synchronize before querying if respawn was requested.
@ -576,26 +535,9 @@ public:
}
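The rewritten apply() above no longer needs enable_if dispatch: it passes TaskResult<result_type>::ptr(task) to apply_functor, and overload resolution picks the plain void* overload for tasks without a result and the T* template for tasks with one. A standalone sketch of that overload-on-pointer-type dispatch (Member, Produce, SideEffect and TaskShim are invented names, not Kokkos types):

#include <iostream>

struct Member {};   // stand-in for the serial/team execution member

template <class Functor>
struct TaskShim : Functor {
  // Non-template overload: selected when the result pointer is a plain void*,
  // i.e. the task produces no value.
  void apply_functor(Member& m, void*) { Functor::operator()(m); }

  // Template overload: selected for any typed result pointer T*.
  template <class T>
  void apply_functor(Member& m, T* result) { Functor::operator()(m, *result); }
};

struct Produce    { void operator()(Member&, double& out) const { out = 42.0; } };
struct SideEffect { void operator()(Member&) const { std::cout << "ran\n"; } };

int main() {
  Member m;
  double r = 0.0;
  TaskShim<Produce> p;    p.apply_functor(m, &r);                           // T* overload
  TaskShim<SideEffect> s; s.apply_functor(m, static_cast<void*>(nullptr));  // void* overload
  std::cout << r << "\n";
  return 0;
}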
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
, FunctorType && arg_functor
)
: base_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, functor_type( arg_functor )
{}
KOKKOS_INLINE_FUNCTION constexpr
TaskBase( FunctorType && arg_functor )
: root_type() , functor_type( std::move(arg_functor) ) {}
KOKKOS_INLINE_FUNCTION
~TaskBase() {}

View File

@ -44,6 +44,8 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0
namespace Kokkos {
namespace Impl {
@ -100,9 +102,11 @@ KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::decrement
( TaskQueue< ExecSpace >::task_root_type * task )
{
const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1);
task_root_type volatile & t = *task ;
#if 0
const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count),-1);
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
if ( 1 == count ) {
printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n"
, uintptr_t( task )
@ -114,9 +118,13 @@ void TaskQueue< ExecSpace >::decrement
#endif
if ( ( 1 == count ) &&
( task->m_next == (task_root_type *) task_root_type::LockTag ) ) {
( t.m_next == (task_root_type *) task_root_type::LockTag ) ) {
// Reference count is zero and task is complete, deallocate.
task->m_queue->deallocate( task , task->m_alloc_size );
TaskQueue< ExecSpace > * const queue =
static_cast< TaskQueue< ExecSpace > * >( t.m_queue );
queue->deallocate( task , t.m_alloc_size );
}
else if ( count <= 1 ) {
Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
@ -171,7 +179,7 @@ bool TaskQueue< ExecSpace >::push_task
// Fail the push attempt if the queue is locked;
// otherwise retry until the push succeeds.
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(*queue)
@ -186,9 +194,9 @@ bool TaskQueue< ExecSpace >::push_task
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * volatile * const next = & task->m_next ;
task_root_type * volatile & next = task->m_next ;
if ( zero != *next ) {
if ( zero != next ) {
Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" );
}
@ -196,9 +204,9 @@ bool TaskQueue< ExecSpace >::push_task
while ( lock != y ) {
*next = y ;
next = y ;
// Do not proceed until '*next' has been stored.
// Do not proceed until 'next' has been stored.
Kokkos::memory_fence();
task_root_type * const x = y ;
@ -211,9 +219,9 @@ bool TaskQueue< ExecSpace >::push_task
// Failed, replace 'task->m_next' value since 'task' remains
// not a member of a queue.
*next = zero ;
next = zero ;
// Do not proceed until '*next' has been stored.
// Do not proceed until 'next' has been stored.
Kokkos::memory_fence();
return false ;
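push_task above is a lock-free singly linked push: link the task to the current head, fence, then compare-and-swap the head pointer, retrying on contention and backing out if the head holds the lock sentinel. A condensed sketch of the same loop with std::atomic (illustrative only; kLock and the memory orders are my choices, not Kokkos code):

#include <atomic>
#include <cstdint>

struct Task {
  Task* m_next = nullptr;
};

// Sentinel meaning "queue is locked; pushes must fail".
static Task* const kLock = reinterpret_cast<Task*>(uintptr_t(~0));

bool push_task(std::atomic<Task*>& queue, Task* task) {
  Task* head = queue.load(std::memory_order_relaxed);
  while (head != kLock) {
    task->m_next = head;                       // link before publishing
    if (queue.compare_exchange_weak(head, task,
                                    std::memory_order_release,
                                    std::memory_order_relaxed)) {
      return true;                             // task is now the queue head
    }
    // CAS failed: 'head' was reloaded with the current value; retry.
  }
  task->m_next = nullptr;                      // queue is locked: undo the link
  return false;
}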
@ -270,11 +278,13 @@ TaskQueue< ExecSpace >::pop_ready_task
// This thread has exclusive access to
// the queue and the popped task's m_next.
*queue = task->m_next ; task->m_next = lock ;
task_root_type * volatile & next = task->m_next ;
*queue = next ; next = lock ;
Kokkos::memory_fence();
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
@ -323,7 +333,7 @@ void TaskQueue< ExecSpace >::schedule_runnable
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -337,20 +347,22 @@ void TaskQueue< ExecSpace >::schedule_runnable
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
task_root_type volatile & t = *task ;
bool respawn = false ;
//----------------------------------------
if ( zero == task->m_wait ) {
if ( zero == t.m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
t.m_wait = end ;
// Task in Waiting state
}
else if ( lock != task->m_wait ) {
else if ( lock != t.m_wait ) {
// Task in Executing state with Respawn request
// - Update dependence
// - Transition to Waiting state
@ -373,7 +385,9 @@ void TaskQueue< ExecSpace >::schedule_runnable
// Exclusive access so don't need an atomic exchange
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
task_root_type * dep = task->m_next ; task->m_next = zero ;
task_root_type * dep = t.m_next ; t.m_next = zero ;
Kokkos::memory_fence();
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
@ -398,7 +412,7 @@ void TaskQueue< ExecSpace >::schedule_runnable
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const ready_queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
& m_ready[ t.m_priority ][ t.m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
@ -441,7 +455,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -455,18 +469,20 @@ void TaskQueue< ExecSpace >::schedule_aggregate
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
task_root_type volatile & t = *task ;
//----------------------------------------
if ( zero == task->m_wait ) {
if ( zero == t.m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
t.m_wait = end ;
// Task in Waiting state
}
else if ( lock == task->m_wait ) {
else if ( lock == t.m_wait ) {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
}
@ -477,14 +493,14 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// (1) created or
// (2) being removed from a completed task's wait list.
task_root_type ** const aggr = task->aggregate_dependences();
task_root_type * volatile * const aggr = t.aggregate_dependences();
// Assume the 'when_all' is complete until a dependence is
// found that is not complete.
bool is_complete = true ;
for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
for ( int i = t.m_dep_count ; 0 < i && is_complete ; ) {
--i ;
@ -523,7 +539,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// Complete the when_all 'task' to schedule other tasks
// that are waiting for the when_all 'task' to complete.
task->m_next = lock ;
t.m_next = lock ;
complete( task );
@ -573,7 +589,7 @@ void TaskQueue< ExecSpace >::complete
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -584,11 +600,13 @@ void TaskQueue< ExecSpace >::complete
fflush( stdout );
#endif
const bool runnable = task_root_type::Aggregate != task->m_task_type ;
task_root_type volatile & t = *task ;
const bool runnable = task_root_type::Aggregate != t.m_task_type ;
//----------------------------------------
if ( runnable && lock != task->m_next ) {
if ( runnable && lock != t.m_next ) {
    // A runnable task has finished executing and requested respawn.
// Schedule the task for subsequent execution.
@ -607,7 +625,7 @@ void TaskQueue< ExecSpace >::complete
// Stop other tasks from adding themselves to this task's wait queue
// by locking the head of this task's wait queue.
task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
task_root_type * x = Kokkos::atomic_exchange( & t.m_wait , lock );
if ( x != (task_root_type *) lock ) {
@ -627,9 +645,13 @@ void TaskQueue< ExecSpace >::complete
// Have exclusive access to 'x' until it is scheduled
// Set x->m_next = zero <= no dependence, not a respawn
task_root_type * const next = x->m_next ; x->m_next = 0 ;
task_root_type volatile & vx = *x ;
if ( task_root_type::Aggregate != x->m_task_type ) {
task_root_type * const next = vx.m_next ; vx.m_next = 0 ;
Kokkos::memory_fence();
if ( task_root_type::Aggregate != vx.m_task_type ) {
schedule_runnable( x );
}
else {

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,7 +47,6 @@
#include <Kokkos_Array.hpp>
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< class DataType , class ArrayLayout , class V , size_t N , class P >
@ -94,13 +93,12 @@ public:
typedef typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type non_const_scalar_array_type ;
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief View mapping for non-specialized data type and standard layout */
@ -597,7 +595,7 @@ public:
}
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -96,6 +96,27 @@ struct is_view_label< const char[N] > : public std::true_type {};
template< typename ... P >
struct ViewCtorProp ;
// Forward declare
template< typename Specialize , typename T >
struct CommonViewAllocProp ;
/* Common value_type stored as ViewCtorProp
*/
template< typename Specialize , typename T >
struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
{
ViewCtorProp() = default ;
ViewCtorProp( const ViewCtorProp & ) = default ;
ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
using type = CommonViewAllocProp<Specialize,T> ;
ViewCtorProp( const type & arg ) : value( arg ) {}
ViewCtorProp( type && arg ) : value( arg ) {}
type value ;
};
/* std::integral_constant<unsigned,I> are dummy arguments
* that avoid duplicate base class errors
*/
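ViewCtorProp stores each view-constructor property, such as the CommonViewAllocProp added here, in its own ViewCtorProp<void,T> specialization, and the variadic pack then inherits one such base per property. A stripped-down sketch of that one-base-per-property pattern with invented property types (CtorProp and get_prop are not Kokkos names):

#include <iostream>
#include <string>

template <class... P> struct CtorProp;

// One base class per property kind.
template <> struct CtorProp<void, std::string> { std::string value; };
template <> struct CtorProp<void, int>         { int value = 0; };

// The pack inherits every property it was given.
template <class... P>
struct CtorProp : CtorProp<void, P>... {
  CtorProp(P const&... args) : CtorProp<void, P>{args}... {}
};

// Recover one property by slicing to the matching base.
template <class T, class... P>
T const& get_prop(CtorProp<P...> const& props) {
  return static_cast<CtorProp<void, T> const&>(props).value;
}

int main() {
  CtorProp<std::string, int> props{"my_view_label", 128};
  std::cout << get_prop<std::string>(props) << " " << get_prop<int>(props) << "\n";
}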

View File

@ -62,7 +62,6 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< unsigned I , size_t ... Args >
@ -250,7 +249,7 @@ struct ViewDimensionAssignable< ViewDimension< DstArgs ... >
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -266,14 +265,11 @@ struct ALL_t {
}} // namespace Kokkos::Impl
namespace Kokkos {
namespace Experimental {
namespace Impl {
using Kokkos::Impl::ALL_t ;
template< class T >
struct is_integral_extent_type
{ enum { value = std::is_same<T,Kokkos::Experimental::Impl::ALL_t>::value ? 1 : 0 }; };
{ enum { value = std::is_same<T,Kokkos::Impl::ALL_t>::value ? 1 : 0 }; };
template< class iType >
struct is_integral_extent_type< std::pair<iType,iType> >
@ -314,10 +310,10 @@ struct SubviewLegalArgsCompileTime;
template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
((CurrentArg>=RankDest) && (std::is_integral<Arg>::value)) ||
((CurrentArg<RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) ||
((CurrentArg==0) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value))
((CurrentArg==0) && (Kokkos::Impl::is_integral_extent_type<Arg>::value))
) && (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
};
@ -331,7 +327,7 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankD
template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
((CurrentArg<RankSrc-RankDest) && (std::is_integral<Arg>::value)) ||
((CurrentArg>=RankSrc-RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value))
) && (SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
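The SubviewLegalArgsCompileTime traits above decide at compile time whether a particular mix of ALL / index / range arguments lets a subview keep the contiguous LayoutLeft or LayoutRight of its source instead of degrading to LayoutStride. A small usage sketch of the two rank-2 cases those rules describe (extents and indices are arbitrary):

#include <Kokkos_Core.hpp>

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double**, Kokkos::LayoutLeft> a("a", 100, 50);
    // Fixing the last (slow) index of a LayoutLeft view keeps a contiguous column:
    auto col = Kokkos::subview(a, Kokkos::ALL(), 7);   // rank-1, still LayoutLeft

    Kokkos::View<double**, Kokkos::LayoutRight> b("b", 100, 50);
    // Fixing the first (slow) index of a LayoutRight view keeps a contiguous row:
    auto row = Kokkos::subview(b, 7, Kokkos::ALL());   // rank-1, still LayoutRight

    (void)col; (void)row;
  }
  Kokkos::finalize();
  return 0;
}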
@ -403,7 +399,7 @@ private:
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim
, const Kokkos::Experimental::Impl::ALL_t
, const Kokkos::Impl::ALL_t
, Args ... args )
{
m_begin[ domain_rank ] = 0 ;
@ -519,7 +515,7 @@ private:
, unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim
, const Kokkos::Experimental::Impl::ALL_t
, const Kokkos::Impl::ALL_t
, Args ... args ) const
{
const int n = std::min( buf_len ,
@ -670,13 +666,12 @@ public:
{ return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u ; }
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Given a value type and dimension generate the View data type */
@ -814,13 +809,12 @@ public:
typedef non_const_type non_const_scalar_array_type ;
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template < class Dimension , class Layout , typename Enable = void >
@ -1228,14 +1222,14 @@ private:
// If memory alignment is a multiple of the trivial scalar size then attempt to align.
enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
  enum { div_ok = div ? div : 1 }; // To avoid modulo by zero in constexpr
  enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo by zero in constexpr
KOKKOS_INLINE_FUNCTION
static constexpr size_t stride( size_t const N )
{
return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
? N + align - ( N % div_ok ) : N ;
}
{
return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
? N + align - ( N % div_ok ) : N ;
}
};
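The stride() helper above pads a leading extent N up to the next multiple of the per-scalar alignment (MEMORY_ALIGNMENT / sizeof(scalar)), but only once the extent exceeds a threshold, so consecutive rows or columns start on aligned addresses. A worked restatement of that rule with assumed numbers, 8 scalars per alignment boundary (e.g. 64-byte alignment and 8-byte doubles) and a threshold factor of 4 (pad_stride is an illustration, not the Kokkos class):

#include <cstddef>

// Illustrative re-statement of the padding rule.
constexpr std::size_t pad_stride(std::size_t N,
                                 std::size_t align,      // alignment / sizeof(scalar)
                                 std::size_t threshold)  // minimum extent worth padding
{
  return (align != 0 && threshold * align < N && N % align != 0)
             ? N + align - (N % align)   // round N up to the next multiple of 'align'
             : N;
}

static_assert(pad_stride(1000, 8, 4) == 1000, "already a multiple of 8: unchanged");
static_assert(pad_stride(1003, 8, 4) == 1008, "rounded up to the next multiple of 8");
static_assert(pad_stride(17,   8, 4) == 17,   "small extents are left unpadded");

int main() { return 0; }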
public:
@ -1707,12 +1701,12 @@ private:
// If memory alignment is a multiple of the trivial scalar size then attempt to align.
enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
  enum { div_ok = div ? div : 1 }; // To avoid modulo by zero in constexpr
  enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo by zero in constexpr
KOKKOS_INLINE_FUNCTION
static constexpr size_t stride( size_t const N )
{
return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
? N + align - ( N % div_ok ) : N ;
}
};
@ -2225,13 +2219,12 @@ public:
{}
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief ViewDataHandle provides the type of the 'data handle' which the view
@ -2422,13 +2415,12 @@ struct ViewDataHandle< Traits ,
return handle_type( arg_data_ptr + offset );
}
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
//----------------------------------------------------------------------------
@ -2451,8 +2443,9 @@ template< class ExecSpace , class ValueType >
struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ >
{
typedef Kokkos::RangePolicy< ExecSpace > PolicyType ;
typedef typename ExecSpace::execution_space Exec;
ExecSpace space ;
Exec space ;
ValueType * ptr ;
size_t n ;
bool destroy ;
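ViewValueFunctor, whose members are shown in this hunk, is what runs the default constructor of every element of a freshly allocated View over a RangePolicy, and later the destructors when the allocation is released. A condensed sketch of that construct/destroy-in-parallel idea (ConstructDestroySketch is an invented stand-in, not the real class):

#include <Kokkos_Core.hpp>
#include <new>

template <class ExecSpace, class ValueType>
struct ConstructDestroySketch {
  ValueType*  ptr;
  std::size_t n;
  bool        destroy;

  KOKKOS_INLINE_FUNCTION
  void operator()(const std::size_t i) const {
    if (destroy) ptr[i].~ValueType();          // run destructors on deallocation
    else         new (ptr + i) ValueType();    // placement-new default construction
  }

  void run() const {
    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0, n), *this);
    Kokkos::fence();
  }
};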
@ -2597,6 +2590,9 @@ private:
public:
typedef void printable_label_typedef;
enum { is_managed = Traits::is_managed };
//----------------------------------------
// Domain dimensions
@ -2944,7 +2940,7 @@ public:
Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension.");
}
dst.m_offset = dst_offset_type( src.m_offset );
dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
dst.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
}
};
@ -3102,7 +3098,7 @@ public:
//----------------------------------------------------------------------------
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -3151,6 +3147,77 @@ void view_error_operator_bounds
view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
}
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
/* Check #3: is the View managed as determined by the MemoryTraits? */
template< class MapType,
bool is_managed = (MapType::is_managed != 0) >
struct OperatorBoundsErrorOnDevice;
template< class MapType >
struct OperatorBoundsErrorOnDevice< MapType, false > {
KOKKOS_INLINE_FUNCTION
static void run(MapType const&) {
Kokkos::abort("View bounds error");
}
};
template< class MapType >
struct OperatorBoundsErrorOnDevice< MapType, true > {
KOKKOS_INLINE_FUNCTION
static void run(MapType const& map) {
char const* const user_alloc_start = reinterpret_cast<char const*>(map.data());
char const* const header_start = user_alloc_start - sizeof(SharedAllocationHeader);
SharedAllocationHeader const* const header =
reinterpret_cast<SharedAllocationHeader const*>(header_start);
char const* const label = header->label();
enum { LEN = 128 };
char msg[LEN];
char const* const first_part = "View bounds error of view ";
char* p = msg;
char* const end = msg + LEN - 1;
for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) {
*p = *p2;
}
for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) {
*p = *p2;
}
*p = '\0';
Kokkos::abort(msg);
}
};
/* Check #2: does the ViewMapping have the printable_label_typedef defined?
   See above: only the non-specialized standard-layout ViewMapping defines
   this typedef by default.
   Its existence therefore implies that MapType::is_managed exists as well. */
template< class T, class Enable = void >
struct has_printable_label_typedef : public std::false_type {};
template<class T>
struct has_printable_label_typedef<
T, typename enable_if_type<typename T::printable_label_typedef>::type>
: public std::true_type
{};
template< class MapType >
KOKKOS_INLINE_FUNCTION
void operator_bounds_error_on_device(
MapType const&,
std::false_type) {
Kokkos::abort("View bounds error");
}
template< class MapType >
KOKKOS_INLINE_FUNCTION
void operator_bounds_error_on_device(
MapType const& map,
std::true_type) {
OperatorBoundsErrorOnDevice< MapType >::run(map);
}
#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
template< class MemorySpace , class MapType , class ... Args >
KOKKOS_INLINE_FUNCTION
void view_verify_operator_bounds
@ -3166,7 +3233,17 @@ void view_verify_operator_bounds
view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
Kokkos::Impl::throw_runtime_exception(std::string(buffer));
#else
Kokkos::abort("View bounds error");
/* Check #1: is there a SharedAllocationRecord?
   (we won't use it, but if it's not there then there isn't
a corresponding SharedAllocationHeader containing a label).
This check should cover the case of Views that don't
have the Unmanaged trait but were initialized by pointer. */
if (tracker.has_record()) {
operator_bounds_error_on_device<MapType>(
map, has_printable_label_typedef<MapType>());
} else {
Kokkos::abort("View bounds error");
}
#endif
}
}
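The device branch above now aborts with a message containing the view's label whenever the allocation is tracked, instead of the bare "View bounds error". A small host-side illustration of what trips this path, assuming a build with Kokkos' debug bounds checking enabled (e.g. via the KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK configuration):

#include <Kokkos_Core.hpp>

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double*> v("labeled_view", 10);
    // With debug bounds checking enabled, an out-of-range index aborts with a
    // message that includes the label "labeled_view"; on device builds the
    // label is recovered from the SharedAllocationHeader as shown above.
    // v(42) = 1.0;   // intentionally commented out: it would terminate the program
    v(9) = 1.0;       // in-bounds access is unaffected
  }
  Kokkos::finalize();
  return 0;
}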

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -48,7 +48,6 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
// View mapping for rank two tiled array
@ -195,11 +194,9 @@ struct ViewMapping
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Experimental {
template< typename T , unsigned N0 , unsigned N1 , class ... P >
KOKKOS_INLINE_FUNCTION
@ -217,7 +214,6 @@ tile_subview( const Kokkos::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> &
( src , SrcLayout() , i_tile0 , i_tile1 );
}
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
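tile_subview above returns one N0 x N1 tile of a LayoutTileLeft view as an ordinary contiguous view. A short usage sketch based on the signature shown; whether the function is reached as Kokkos::Experimental::tile_subview or plain Kokkos::tile_subview depends on the namespace change in this file, so the qualification below is an assumption:

#include <Kokkos_Core.hpp>

int main(int argc, char** argv) {
  Kokkos::initialize(argc, argv);
  {
    // A 64x64 view stored as 8x8 tiles, each tile contiguous.
    Kokkos::View<double**, Kokkos::LayoutTileLeft<8, 8>, Kokkos::HostSpace>
        a("tiled", 64, 64);

    // Tile (2,3) covers rows 16..23 and columns 24..31 of 'a'.
    auto tile = Kokkos::Experimental::tile_subview(a, 2, 3);

    for (int i = 0; i < 8; ++i)
      for (int j = 0; j < 8; ++j)
        tile(i, j) = double(i * 8 + j);
  }
  Kokkos::finalize();
  return 0;
}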

View File

@ -1,183 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <impl/Kokkos_spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>
/*--------------------------------------------------------------------------*/
#if !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define KOKKOS_INTERNAL_PAUSE
#else
/* Pause instruction to prevent excess processor bus usage */
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
#endif
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0u: KOKKOS_INTERNAL_NOP2; break;
case 1u: KOKKOS_INTERNAL_NOP4; break;
case 2u: KOKKOS_INTERNAL_NOP8; break;
case 3u: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#else
#include <sched.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
sched_yield();
}
}
#endif
#else // defined( _WIN32 )
#if defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
Sleep(0);
}
}
#elif defined( _MSC_VER )
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
YieldProcessor();
}
}
#else
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0: KOKKOS_INTERNAL_NOP2; break;
case 1: KOKKOS_INTERNAL_NOP4; break;
case 2: KOKKOS_INTERNAL_NOP8; break;
case 3: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#endif
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}
#endif
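The removed file above implemented spinwait_while_equal / spinwait_until_equal with an escalating nop/pause ladder between polls (its replacement lives elsewhere in this update). A portable analogue of the same spin-then-yield backoff, using std::atomic and std::this_thread::yield instead of inline assembly (a sketch, not the Kokkos replacement):

#include <atomic>
#include <cstdint>
#include <thread>

// Spin while 'flag' still holds 'value'; busy-poll briefly, then start
// yielding the core so a long wait does not burn a hardware thread.
inline void spinwait_while_equal(std::atomic<int32_t>& flag, const int32_t value) {
  unsigned i = 0;
  while (flag.load(std::memory_order_acquire) == value) {
    if (i < 64) {
      // short wait: keep polling for low wake-up latency
    } else {
      std::this_thread::yield();  // long wait: let the scheduler run someone else
    }
    ++i;
  }
}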