Updating Kokkos lib to 2.03.00

Stan Moore
2017-04-25 13:48:51 -06:00
parent 9f6e126a2f
commit 8910ec6e59
261 changed files with 27816 additions and 17799 deletions

File diff suppressed because it is too large.


@@ -56,12 +56,13 @@ int bit_scan_forward( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __cnttz4(i);
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#else
unsigned t = 1u;
int r = 0;
while ( i && ( i & t == 0 ) )
@@ -79,10 +80,12 @@ int bit_scan_reverse( unsigned i )
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
return shift - __cntlz4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
@@ -101,10 +104,12 @@ int bit_count( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __popcnt4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ( ( i >> 1 ) & ~0u / 3u ); // temp
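For orientation, the fallback branch above is truncated in this hunk. A minimal sketch of the SWAR population count that the linked bithacks page describes (an illustration only, not the verbatim remainder of the file):

#include <climits>

inline int bit_count_fallback( unsigned i )
{
  // The divisions produce 0x55555555, 0x33333333, 0x0F0F0F0F and 0x01010101
  // portably for 32-bit unsigned.
  i = i - ( ( i >> 1 ) & ~0u / 3u );                              // 2-bit sums
  i = ( i & ~0u / 15u * 3u ) + ( ( i >> 2 ) & ~0u / 15u * 3u );   // 4-bit sums
  i = ( i + ( i >> 4 ) ) & ~0u / 255u * 15u ;                     // 8-bit sums
  return int( ( i * ( ~0u / 255u ) ) >> ( ( sizeof(unsigned) - 1 ) * CHAR_BIT ) );
}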


@@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@@ -147,7 +147,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
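The change from "#if (KOKKOS_ENABLE_PROFILING)" to "#if defined(KOKKOS_ENABLE_PROFILING)" matters when the macro is defined with no value (e.g. -DKOKKOS_ENABLE_PROFILING on the command line). A standalone sketch with a hypothetical FEATURE macro:

// Compile with:  c++ -DFEATURE demo.cpp
// An empty definition makes "#if (FEATURE)" expand to "#if ()", which is
// ill-formed; "#if defined(FEATURE)" only asks whether the macro exists.
#include <cstdio>

int main()
{
#if defined(FEATURE)   // robust for -DFEATURE as well as -DFEATURE=1
  std::puts("FEATURE enabled");
#else
  std::puts("FEATURE disabled");
#endif
  return 0;
}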
@@ -155,7 +155,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
void finalize_internal( const bool all_spaces = false )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
@@ -449,5 +449,323 @@ void fence()
Impl::fence_internal();
}
void print_configuration( std::ostream & out , const bool detail )
{
std::ostringstream msg;
msg << "Compiler:" << std::endl;
#ifdef KOKKOS_COMPILER_APPLECC
msg << " KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CLANG
msg << " KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CRAYC
msg << " KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_GNU
msg << " KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
#endif
#ifdef KOKKOS_COMPILER_IBM
msg << " KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
#endif
#ifdef KOKKOS_COMPILER_INTEL
msg << " KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
#endif
#ifdef KOKKOS_COMPILER_NVCC
msg << " KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_PGI
msg << " KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
#endif
msg << "Architecture:" << std::endl;
#ifdef KOKKOS_ENABLE_ISA_KNC
msg << " KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_X86_64
msg << " KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
#endif
msg << "Devices:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA: ";
#ifdef KOKKOS_ENABLE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP: ";
#ifdef KOKKOS_ENABLE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PTHREAD: ";
#ifdef KOKKOS_ENABLE_PTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_STDTHREAD: ";
#ifdef KOKKOS_ENABLE_STDTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINTHREAD: ";
#ifdef KOKKOS_ENABLE_WINTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_SERIAL: ";
#ifdef KOKKOS_ENABLE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Default Device:" << std::endl;
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Atomics:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_ATOMICS: ";
#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_GNU_ATOMICS: ";
#ifdef KOKKOS_ENABLE_GNU_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_ATOMICS: ";
#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP_ATOMICS: ";
#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Vectorization:" << std::endl;
msg << " KOKKOS_ENABLE_PRAGMA_IVDEP: ";
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_SIMD: ";
#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_UNROLL: ";
#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_VECTOR: ";
#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Memory:" << std::endl;
msg << " KOKKOS_ENABLE_HBWSPACE: ";
#ifdef KOKKOS_ENABLE_HBWSPACE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_POSIX_MEMALIGN: ";
#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Options:" << std::endl;
msg << " KOKKOS_ENABLE_ASM: ";
#ifdef KOKKOS_ENABLE_ASM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX1Z: ";
#ifdef KOKKOS_ENABLE_CXX1Z
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_HWLOC: ";
#ifdef KOKKOS_ENABLE_HWLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_LIBRT: ";
#ifdef KOKKOS_ENABLE_LIBRT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_MPI: ";
#ifdef KOKKOS_ENABLE_MPI
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PROFILING: ";
#ifdef KOKKOS_ENABLE_PROFILING
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_CUDA
msg << "Cuda Options:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_UVM: ";
#ifdef KOKKOS_ENABLE_CUDA_UVM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUSPARSE: ";
#ifdef KOKKOS_ENABLE_CUSPARSE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#endif
msg << "\nRuntime Configuration:" << std::endl;
#ifdef KOKKOS_ENABLE_CUDA
Cuda::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_OPENMP
OpenMP::print_configuration(msg, detail);
#endif
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( WINTHREAD )
Threads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_QTHREADS
Qthreads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_SERIAL
Serial::print_configuration(msg, detail);
#endif
out << msg.str() << std::endl;
}
} // namespace Kokkos
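A typical call site for the new print_configuration, sketched as an ordinary Kokkos program (argc/argv are whatever main received):

#include <Kokkos_Core.hpp>
#include <iostream>

int main( int argc , char * argv[] )
{
  Kokkos::initialize( argc , argv );

  // 'true' requests the detailed per-backend runtime report in addition
  // to the yes/no macro summary assembled above.
  Kokkos::print_configuration( std::cout , true );

  Kokkos::finalize();
  return 0;
}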


@@ -0,0 +1,653 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_FUNCTORANALYSIS_HPP
#define KOKKOS_FUNCTORANALYSIS_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Reducer.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct FunctorPatternInterface {
struct FOR {};
struct REDUCE {};
struct SCAN {};
};
/** \brief Query Functor and execution policy argument tag for value type.
*
* If 'value_type' is not explicitly declared in the functor
* then attempt to deduce the type from FunctorType::operator()
* interface used by the pattern and policy.
*
* For the REDUCE pattern generate a Reducer and finalization function
* derived from what is available within the functor.
*/
template< typename PatternInterface , class Policy , class Functor >
struct FunctorAnalysis {
private:
using FOR = FunctorPatternInterface::FOR ;
using REDUCE = FunctorPatternInterface::REDUCE ;
using SCAN = FunctorPatternInterface::SCAN ;
//----------------------------------------
struct VOID {};
template< typename P = Policy , typename = std::false_type >
struct has_work_tag
{
using type = void ;
using wtag = VOID ;
};
template< typename P >
struct has_work_tag
< P , typename std::is_same< typename P::work_tag , void >::type >
{
using type = typename P::work_tag ;
using wtag = typename P::work_tag ;
};
using Tag = typename has_work_tag<>::type ;
using WTag = typename has_work_tag<>::wtag ;
//----------------------------------------
// Check for Functor::value_type, which is either a simple type T or T[]
template< typename F , typename = std::false_type >
struct has_value_type { using type = void ; };
template< typename F >
struct has_value_type
< F , typename std::is_same< typename F::value_type , void >::type >
{
using type = typename F::value_type ;
static_assert( ! std::is_reference< type >::value &&
std::rank< type >::value <= 1 &&
std::extent< type >::value == 0
, "Kokkos Functor::value_type is T or T[]" );
};
//----------------------------------------
// If Functor::value_type does not exist then evaluate operator(),
// depending upon the pattern and whether the policy has a work tag,
// to determine the reduction or scan value_type.
template< typename F
, typename P = PatternInterface
, typename V = typename has_value_type<F>::type
, bool T = std::is_same< Tag , void >::value
>
struct deduce_value_type { using type = V ; };
template< typename F >
struct deduce_value_type< F , REDUCE , void , true > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , REDUCE , void , false > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & ) const );
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , true > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , false > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & , I ) const );
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
//----------------------------------------
using candidate_type = typename deduce_value_type< Functor >::type ;
enum { candidate_is_void = std::is_same< candidate_type , void >::value
, candidate_is_array = std::rank< candidate_type >::value == 1 };
//----------------------------------------
public:
using value_type = typename std::remove_extent< candidate_type >::type ;
static_assert( ! std::is_const< value_type >::value
, "Kokkos functor operator reduce argument cannot be const" );
private:
// Stub to avoid defining a type 'void &'
using ValueType = typename
std::conditional< candidate_is_void , VOID , value_type >::type ;
public:
using pointer_type = typename
std::conditional< candidate_is_void , void , ValueType * >::type ;
using reference_type = typename
std::conditional< candidate_is_array , ValueType * , typename
std::conditional< ! candidate_is_void , ValueType & , void >
::type >::type ;
private:
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< IsArray , unsigned >::type
get_length( FF const & f ) { return f.value_count ; }
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< ! IsArray , unsigned >::type
get_length( FF const & ) { return 1 ; }
public:
enum { StaticValueSize = ! candidate_is_void &&
! candidate_is_array
? sizeof(ValueType) : 0 };
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f); }
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
//----------------------------------------
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Unknown & )
{ return 1 ; }
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Unknown & )
{ return sizeof(ValueType); }
private:
enum INTERFACE : int
{ DISABLE = 0
, NO_TAG_NOT_ARRAY = 1
, NO_TAG_IS_ARRAY = 2
, HAS_TAG_NOT_ARRAY = 3
, HAS_TAG_IS_ARRAY = 4
, DEDUCED =
! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
std::is_same<Tag,void>::value
? (candidate_is_array ? NO_TAG_IS_ARRAY : NO_TAG_NOT_ARRAY)
: (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
};
//----------------------------------------
// parallel_reduce join operator
template< class F , INTERFACE >
struct has_join_function ;
template< class F >
struct has_join_function< F , NO_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( *dst , *src ); }
};
template< class F >
struct has_join_function< F , NO_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( dst , src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , *dst , *src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , dst , src ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceJoin
{
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{
const int n = FunctorAnalysis::value_count( f );
for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
}
};
template< class F >
struct DeduceJoin< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void join( F const &
, ValueType volatile *
, ValueType volatile const * ) {}
};
template< class F , INTERFACE I >
struct DeduceJoin< F , I ,
decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
: public has_join_function<F,I> {};
//----------------------------------------
template< class , INTERFACE >
struct has_init_function ;
template< class F >
struct has_init_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( *dst ); }
};
template< class F >
struct has_init_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), *dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceInit
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
};
template< class F >
struct DeduceInit< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceInit< F , I ,
decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
: public has_init_function<F,I> {};
//----------------------------------------
public:
struct Reducer
{
private:
Functor const & m_functor ;
ValueType * const m_result ;
int const m_length ;
public:
using reducer = Reducer ;
using value_type = FunctorAnalysis::value_type ;
using memory_space = void ;
using reference_type = FunctorAnalysis::reference_type ;
KOKKOS_INLINE_FUNCTION
void join( ValueType volatile * dst
, ValueType volatile const * src ) const noexcept
{ DeduceJoin<>::join( m_functor , dst , src ); }
KOKKOS_INLINE_FUNCTION
void init( ValueType * dst ) const noexcept
{ DeduceInit<>::init( m_functor , dst ); }
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( Functor const & arg_functor
, ValueType * arg_value = 0
, int arg_length = 0 ) noexcept
: m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept { return m_length ; }
KOKKOS_INLINE_FUNCTION
ValueType & operator[]( int i ) const noexcept
{ return m_result[i]; }
private:
template< bool IsArray >
constexpr
typename std::enable_if< IsArray , ValueType * >::type
ref() const noexcept { return m_result ; }
template< bool IsArray >
constexpr
typename std::enable_if< ! IsArray , ValueType & >::type
ref() const noexcept { return *m_result ; }
public:
KOKKOS_INLINE_FUNCTION
auto result() const noexcept
-> decltype( Reducer::template ref< candidate_is_array >() )
{ return Reducer::template ref< candidate_is_array >(); }
};
//----------------------------------------
private:
template< class , INTERFACE >
struct has_final_function ;
// No tag, not array
template< class F >
struct has_final_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( *dst ); }
};
// No tag, is array
template< class F >
struct has_final_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( dst ); }
};
// Has tag, not array
template< class F >
struct has_final_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), *dst ); }
};
// Has tag, is array
template< class F >
struct has_final_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceFinal
{
KOKKOS_INLINE_FUNCTION
static void final( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceFinal< F , I ,
decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
: public has_init_function<F,I> {};
public:
static void final( Functor const & f , ValueType * result )
{ DeduceFinal<>::final( f , result ); }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
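Two functor shapes this analysis distinguishes, as a sketch (the struct names are illustrative, not part of Kokkos):

#include <Kokkos_Macros.hpp>

// (a) No 'value_type' member: for the REDUCE pattern the value type (double)
//     is deduced from the reduction argument of operator(), and
//     reference_type becomes double &.
struct DeducedSum {
  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double & update ) const { update += double(i); }
};

// (b) Explicit runtime-length array reduction: value_type is declared as T[]
//     and 'value_count' is what FunctorAnalysis::value_count()/value_size()
//     query at run time; reference_type becomes double *.
struct ArraySum {
  using value_type = double[] ;
  int value_count ;

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double dst[] ) const
    { for ( int j = 0 ; j < value_count ; ++j ) dst[j] += double(i); }
};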


@@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@@ -62,7 +62,7 @@
#include <memkind.h>
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
@@ -198,7 +198,7 @@ void * HBWSpace::allocate( const size_t arg_alloc_size ) const
case STD_MALLOC: msg << "STD_MALLOC" ; break ;
}
msg << " ]( " << arg_alloc_size << " ) FAILED" ;
if ( ptr == NULL ) { msg << " NULL" ; }
if ( ptr == NULL ) { msg << " NULL" ; }
else { msg << " NOT ALIGNED " << ptr ; }
std::cerr << msg.str() << std::endl ;
@@ -218,7 +218,7 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
if ( m_alloc_mech == STD_MALLOC ) {
void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
memkind_free(MEMKIND_TYPE, alloc_ptr );
}
}
}
}
@@ -249,7 +249,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -278,7 +278,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@@ -297,7 +297,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
void * SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
, const std::string & arg_alloc_label
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;


@@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -36,14 +36,14 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <algorithm>
#include <Kokkos_Macros.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
@@ -292,7 +292,7 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
case INTEL_MM_ALLOC: msg << "INTEL_MM_ALLOC" ; break ;
}
msg << " ]( " << arg_alloc_size << " ) FAILED" ;
if ( ptr == NULL ) { msg << " NULL" ; }
if ( ptr == NULL ) { msg << " NULL" ; }
else { msg << " NOT ALIGNED " << ptr ; }
std::cerr << msg.str() << std::endl ;
@@ -312,7 +312,7 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_
if ( m_alloc_mech == STD_MALLOC ) {
void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
free( alloc_ptr );
}
}
#if defined( KOKKOS_ENABLE_INTEL_MM_ALLOC )
else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
@@ -359,7 +359,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::HostSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
@@ -388,7 +388,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@@ -406,7 +406,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
void * SharedAllocationRecord< Kokkos::HostSpace , void >::
allocate_tracked( const Kokkos::HostSpace & arg_space
, const std::string & arg_alloc_label
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;


@@ -0,0 +1,463 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <limits>
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void HostThreadTeamData::organize_pool
( HostThreadTeamData * members[] , const int size )
{
bool ok = true ;
// Verify not already a member of a pool:
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
}
if ( ok ) {
int64_t * const root_scratch = members[0]->m_scratch ;
for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
root_scratch[i] = 0 ;
}
{
HostThreadTeamData ** const pool =
(HostThreadTeamData **) (root_scratch + m_pool_members);
// team size == 1, league size == pool_size
for ( int rank = 0 ; rank < size ; ++rank ) {
HostThreadTeamData * const mem = members[ rank ] ;
mem->m_pool_scratch = root_scratch ;
mem->m_team_scratch = mem->m_scratch ;
mem->m_pool_rank = rank ;
mem->m_pool_size = size ;
mem->m_team_base = rank ;
mem->m_team_rank = 0 ;
mem->m_team_size = 1 ;
mem->m_team_alloc = 1 ;
mem->m_league_rank = rank ;
mem->m_league_size = size ;
mem->m_pool_rendezvous_step = 0 ;
mem->m_team_rendezvous_step = 0 ;
pool[ rank ] = mem ;
}
}
Kokkos::memory_fence();
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
}
}
void HostThreadTeamData::disband_pool()
{
m_work_range.first = -1 ;
m_work_range.second = -1 ;
m_pool_scratch = 0 ;
m_team_scratch = 0 ;
m_pool_rank = 0 ;
m_pool_size = 1 ;
m_team_base = 0 ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = 0 ;
m_league_size = 1 ;
m_pool_rendezvous_step = 0 ;
m_team_rendezvous_step = 0 ;
}
int HostThreadTeamData::organize_team( const int team_size )
{
// Pool is initialized
const bool ok_pool = 0 != m_pool_scratch ;
// Team is not set
const bool ok_team =
m_team_scratch == m_scratch &&
m_team_base == m_pool_rank &&
m_team_rank == 0 &&
m_team_size == 1 &&
m_team_alloc == 1 &&
m_league_rank == m_pool_rank &&
m_league_size == m_pool_size ;
if ( ok_pool && ok_team ) {
if ( team_size <= 0 ) return 0 ; // No teams to organize
if ( team_size == 1 ) return 1 ; // Already organized in teams of one
HostThreadTeamData * const * const pool =
(HostThreadTeamData **) (m_pool_scratch + m_pool_members);
// "league_size" in this context is the number of concurrent teams
// that the pool can accommodate. Excess threads are idle.
const int league_size = m_pool_size / team_size ;
const int team_alloc_size = m_pool_size / league_size ;
const int team_alloc_rank = m_pool_rank % team_alloc_size ;
const int league_rank = m_pool_rank / team_alloc_size ;
const int team_base_rank = league_rank * team_alloc_size ;
m_team_scratch = pool[ team_base_rank ]->m_scratch ;
m_team_base = team_base_rank ;
// This needs to check overflow: if m_pool_size % team_alloc_size != 0
// there are two corner cases:
// (i) if team_alloc_size == team_size there might be a non-full
// zombie team around (for example m_pool_size = 5 and team_size = 2)
// (ii) if team_alloc > team_size then the last team might have fewer
// threads than the others
m_team_rank = ( team_base_rank + team_size <= m_pool_size ) &&
( team_alloc_rank < team_size ) ?
team_alloc_rank : -1;
m_team_size = team_size ;
m_team_alloc = team_alloc_size ;
m_league_rank = league_rank ;
m_league_size = league_size ;
m_team_rendezvous_step = 0 ;
if ( team_base_rank == m_pool_rank ) {
// Initialize team's rendezvous memory
for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
m_scratch[i] = 0 ;
}
// Make sure team's rendezvous memory initialized
// is written before proceeding.
Kokkos::memory_fence();
}
// Organizing threads into a team performs a barrier across the
// entire pool to ensure proper initialization of the team
// rendezvous mechanism before a team rendezvous can be performed.
if ( pool_rendezvous() ) {
pool_rendezvous_release();
}
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
}
return 0 <= m_team_rank ;
}
void HostThreadTeamData::disband_team()
{
m_team_scratch = m_scratch ;
m_team_base = m_pool_rank ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = m_pool_rank ;
m_league_size = m_pool_size ;
m_team_rendezvous_step = 0 ;
}
//----------------------------------------------------------------------------
/* pattern for rendezvous
*
* if ( rendezvous() ) {
* ... all other threads are still in team_rendezvous() ...
* rendezvous_release();
* ... all other threads are released from team_rendezvous() ...
* }
*/
int HostThreadTeamData::rendezvous( int64_t * const buffer
, int & rendezvous_step
, int const size
, int const rank ) noexcept
{
enum : int { shift_byte = 3 };
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
enum : int { mask_byte = size_byte - 1 };
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Cycle step values: 1 <= step <= size_val_cycle
// An odd multiple of memory cycle so that when a memory location
// is reused it has a different value.
// Must be representable within a single byte: size_val_cycle < 16
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called by rank = [ 0 .. size )
// buffer aligned to int64_t[4]
// A sequence of rendezvous uses four cycled locations in memory
// and non-equal cycled synchronization values to
// 1) prevent rendezvous from overtaking one another and
// 2) give each spin wait location an int64_t[4] span
// so that it has its own cache line.
const int step = ( rendezvous_step % size_val_cycle ) + 1 ;
rendezvous_step = step ;
// The leading int64_t[4] span is for thread 0 to write
// and all other threads to read spin-wait.
// sync_offset is the index into this array for this step.
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
union {
int64_t full ;
int8_t byte[8] ;
} value ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
if ( group_begin < size ) {
// This thread waits for threads
// [ group_begin .. group_begin + 8 )
// [ rank*8 .. rank*8 + 8 )
// to write to their designated bytes.
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
value.full = 0 ;
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
store_fence(); // This should not be needed but fixes #742
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value.full );
}
{
// This thread sets its designated byte.
// ( rank % size_byte ) +
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
// ( sync_offset * size_byte )
const int offset = ( rank & mask_byte )
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
+ ( sync_offset << shift_byte );
// All of this thread's previous memory stores must be complete before
// this thread stores the step value at this thread's designated byte
// in the shared synchronization array.
Kokkos::memory_fence();
((volatile int8_t*) buffer)[ offset ] = int8_t( step );
// Memory fence to push the previous store out
Kokkos::memory_fence();
}
// Wait for thread 0 to release all other threads
spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) );
}
else {
// Thread 0 waits for threads [1..7]
// to write to their designated bytes.
const int end = size_byte < size ? 8 : size ;
value.full = 0 ;
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
spinwait_until_equal( buffer[ sync_offset ], value.full );
}
return rank ? 0 : 1 ;
}
void HostThreadTeamData::
rendezvous_release( int64_t * const buffer
, int const rendezvous_step ) noexcept
{
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Requires:
// Called after team_rendezvous
// Called only by true == team_rendezvous(root)
// Memory fence to be sure all previous writes are complete:
Kokkos::memory_fence();
((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] =
int64_t( rendezvous_step );
// Memory fence to push the store out
Kokkos::memory_fence();
}
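The comment block before rendezvous() describes the intended usage: the root thread returns true, runs a serial section while every other thread spins, then releases the team. A self-contained analogue of that pattern built on std::atomic (an illustration only; the Kokkos version cycles buffer slots and per-rank byte lanes as documented above):

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

std::atomic<int> arrived{0};    // non-root threads that have checked in
std::atomic<int> released{0};   // generation counter bumped by the root

void worker( int rank , int size )
{
  const int gen = released.load();
  if ( 0 == rank ) {                                     // rank 0 plays the root
    while ( arrived.load() != size - 1 ) { /* spin */ }  // wait for the team
    std::printf( "rank 0 runs the serial section\n" );
    arrived.store( 0 );
    released.store( gen + 1 );                           // release the team
  }
  else {
    arrived.fetch_add( 1 );                              // check in ...
    while ( released.load() == gen ) { /* spin */ }      // ... and wait
  }
}

int main()
{
  std::vector<std::thread> pool;
  for ( int r = 0 ; r < 4 ; ++r ) pool.emplace_back( worker , r , 4 );
  for ( auto & t : pool ) t.join();
  return 0;
}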
//----------------------------------------------------------------------------
int HostThreadTeamData::get_work_stealing() noexcept
{
pair_int_t w( -1 , -1 );
if ( 1 == m_team_size || team_rendezvous() ) {
// Attempt first from beginning of my work range
for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
// Query and attempt to update m_work_range
// from: [ w.first , w.second )
// to: [ w.first + 1 , w.second ) = w_new
//
// If w is invalid then is just a query.
const pair_int_t w_new( w.first + 1 , w.second );
w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
if ( w.first < w.second ) {
// m_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first + 1 &&
w_new.second == w.second );
}
else {
// m_work_range is not viable
w.first = -1 ;
w.second = -1 ;
attempt = 0 ;
}
}
if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
HostThreadTeamData * const * const pool =
(HostThreadTeamData**)( m_pool_scratch + m_pool_members );
// Attempt from beginning failed, try to steal from end of neighbor
pair_int_t volatile * steal_range =
& ( pool[ m_steal_rank ]->m_work_range );
for ( int attempt = true ; attempt ; ) {
// Query and attempt to update steal_work_range
// from: [ w.first , w.second )
// to: [ w.first , w.second - 1 ) = w_new
//
// If w is invalid then is just a query.
const pair_int_t w_new( w.first , w.second - 1 );
w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
if ( w.first < w.second ) {
// steal_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first &&
w_new.second == w.second - 1 );
}
else {
// steal_work_range is not viable, move to next member
w.first = -1 ;
w.second = -1 ;
// We need to figure out whether the next team is active
// m_steal_rank + m_team_alloc could be the next base_rank to steal from
// but only if there are another m_team_size threads available so that that
// base rank has a full team.
m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
m_steal_rank + m_team_alloc : 0;
steal_range = & ( pool[ m_steal_rank ]->m_work_range );
// If tried all other members then don't repeat attempt to steal
attempt = m_steal_rank != m_pool_rank ;
}
}
if ( w.first != -1 ) w.first = w.second - 1 ;
}
if ( 1 < m_team_size ) {
// Must share the work index
*((int volatile *) team_reduce()) = w.first ;
team_rendezvous_release();
}
}
else if ( 1 < m_team_size ) {
w.first = *((int volatile *) team_reduce());
}
// May exit because successfully stole work and w is good.
// May exit because no work left to steal and w = (-1,-1).
#if 0
fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
, m_pool_rank , m_pool_size , w.first );
fflush(stdout);
#endif
return w.first ;
}
} // namespace Impl
} // namespace Kokkos
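A backend consumes get_work_stealing() by repeatedly asking for a work index until -1 signals that nothing is left to claim or steal, roughly as follows (a sketch; 'data' and 'process_chunk' are placeholders, not Kokkos names):

for ( int w = data.get_work_stealing() ; 0 <= w ; w = data.get_work_stealing() ) {
  process_chunk( w );   // hypothetical: execute the claimed or stolen work index
}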

File diff suppressed because it is too large.


@@ -52,6 +52,10 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();
@@ -76,8 +80,8 @@ void store_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"sfence" ::: "memory"
);
"sfence" ::: "memory"
);
#else
memory_fence();
#endif
@@ -93,8 +97,8 @@ void load_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"lfence" ::: "memory"
);
"lfence" ::: "memory"
);
#else
memory_fence();
#endif
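These fences back the usual publish/consume pattern also visible in the HostThreadTeam rendezvous above: write the payload, fence, then set the flag. A minimal sketch ('payload' and 'ready' are illustrative globals, not Kokkos names):

#include <Kokkos_Core.hpp>

volatile int payload = 0 ;
volatile int ready   = 0 ;

void producer()
{
  payload = 42 ;
  Kokkos::memory_fence();   // payload must be visible before the flag
  ready = 1 ;
}

void consumer()
{
  while ( 0 == ready ) { /* spin */ }
  Kokkos::memory_fence();   // flag observed; do not hoist the payload load
  // 'payload' now reads 42
}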


@@ -129,8 +129,8 @@
#endif
#ifdef KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RDC
#define KOKKOS_ENABLE_CUDA_RDC KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
#endif
#endif
@@ -242,9 +242,9 @@
#endif
#endif
#ifdef KOKKOS_HAVE_QTHREAD
#ifndef KOKKOS_ENABLE_QTHREAD
#define KOKKOS_ENABLE_QTHREAD KOKKOS_HAVE_QTHREAD
#ifdef KOKKOS_HAVE_QTHREADS
#ifndef KOKKOS_ENABLE_QTHREADS
#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
#endif
#endif


@@ -43,7 +43,7 @@
#include <impl/Kokkos_Profiling_Interface.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <string.h>
namespace Kokkos {
@@ -84,21 +84,21 @@ namespace Kokkos {
(*endScanCallee)(kernelID);
}
}
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginReduceCallee) {
Kokkos::fence();
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelReduce(const uint64_t kernelID) {
if(NULL != endReduceCallee) {
Kokkos::fence();
(*endReduceCallee)(kernelID);
}
}
void pushRegion(const std::string& kName) {
if( NULL != pushRegionCallee ) {


@@ -50,7 +50,7 @@
#include <string>
#include <cinttypes>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
#include <dlfcn.h>
#include <iostream>
@@ -59,7 +59,7 @@
#define KOKKOSP_INTERFACE_VERSION 20150628
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
namespace Kokkos {
namespace Profiling {


@@ -0,0 +1,317 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_REDUCER_HPP
#define KOKKOS_IMPL_REDUCER_HPP
#include <impl/Kokkos_Traits.hpp>
//----------------------------------------------------------------------------
/* Reducer abstraction:
* 1) Provides 'join' operation
* 2) Provides 'init' operation
* 3) Provides 'copy' operation
* 4) Optionally provides result value in a memory space
*
* Created from:
* 1) Functor::operator()( destination , source )
* 2) Functor::{ join , init }
*/
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< typename value_type >
struct ReduceSum
{
KOKKOS_INLINE_FUNCTION static
void copy( value_type & dest
, value_type const & src ) noexcept
{ dest = src ; }
KOKKOS_INLINE_FUNCTION static
void init( value_type & dest ) noexcept
{ new( &dest ) value_type(); }
KOKKOS_INLINE_FUNCTION static
void join( value_type volatile & dest
, value_type const volatile & src ) noexcept
{ dest += src ; }
KOKKOS_INLINE_FUNCTION static
void join( value_type & dest
, value_type const & src ) noexcept
{ dest += src ; }
};
template< typename T
, class ReduceOp = ReduceSum< T >
, typename MemorySpace = void >
struct Reducer
: private ReduceOp
, private integral_nonzero_constant
< int , ( std::rank<T>::value == 1 ? std::extent<T>::value : 1 )>
{
private:
// Determine if T is simple array
enum : int { rank = std::rank<T>::value };
static_assert( rank <= 1 , "Kokkos::Impl::Reducer type is at most rank-one" );
using length_t =
integral_nonzero_constant<int,( rank == 1 ? std::extent<T>::value : 1 )> ;
public:
using reducer = Reducer ;
using memory_space = MemorySpace ;
using value_type = typename std::remove_extent<T>::type ;
using reference_type =
typename std::conditional< ( rank != 0 )
, value_type *
, value_type &
>::type ;
private:
//--------------------------------------------------------------------------
// Determine what functions 'ReduceOp' provides:
// copy( destination , source )
// init( destination )
//
// operator()( destination , source )
// join( destination , source )
//
// Provide defaults for missing optional operations
template< class R , typename = void>
struct COPY {
KOKKOS_INLINE_FUNCTION static
void copy( R const &
, value_type * dst
, value_type const * src ) { *dst = *src ; }
};
template< class R >
struct COPY< R , decltype( ((R*)0)->copy( *((value_type*)0)
, *((value_type const *)0) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void copy( R const & r
, value_type * dst
, value_type const * src ) { r.copy( *dst , *src ); }
};
template< class R , typename = void >
struct INIT {
KOKKOS_INLINE_FUNCTION static
void init( R const & , value_type * dst ) { new(dst) value_type(); }
};
template< class R >
struct INIT< R , decltype( ((R*)0)->init( *((value_type*)0 ) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void init( R const & r , value_type * dst ) { r.init( *dst ); }
};
template< class R , typename V , typename = void > struct JOIN
{
// If no join function then try operator()
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.operator()(*dst,*src); }
};
template< class R , typename V >
struct JOIN< R , V , decltype( ((R*)0)->join ( *((V *)0) , *((V const *)0) ) ) >
{
// If has join function use it
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.join(*dst,*src); }
};
//--------------------------------------------------------------------------
value_type * const m_result ;
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 != Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return p ; }
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 == Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return *p ; }
public:
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept
{ return length_t::value ; }
KOKKOS_INLINE_FUNCTION
value_type * data() const noexcept
{ return m_result ; }
KOKKOS_INLINE_FUNCTION
reference_type reference() const noexcept
{ return Reducer::template ref< rank >( m_result ); }
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
void copy( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template COPY<ReduceOp>::copy( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void init( value_type * dest ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template INIT<ReduceOp>::init( (ReduceOp &) *this , dest + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type volatile * const dest
, value_type volatile const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type volatile>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
//--------------------------------------------------------------------------
template< typename ArgT >
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer
( ArgT * arg_value
, typename std::enable_if
< std::is_same<ArgT,value_type>::value &&
std::is_default_constructible< ReduceOp >::value
, int >::type arg_length = 1
) noexcept
: ReduceOp(), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp const & arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp && arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
Reducer( Reducer const & ) = default ;
Reducer( Reducer && ) = default ;
Reducer & operator = ( Reducer const & ) = default ;
Reducer & operator = ( Reducer && ) = default ;
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ValueType >
constexpr
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >
Sum( ValueType & arg_value )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & arg_value );
}
template< typename ValueType >
constexpr
Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >
Sum( ValueType * arg_value , int arg_length )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >( arg_value , arg_length );
}
//----------------------------------------------------------------------------
template< typename ValueType , class JoinType >
Impl::Reducer< ValueType , JoinType >
reducer( ValueType & value , JoinType const & lambda )
{
return Impl::Reducer< ValueType , JoinType >( lambda , & value );
}
} // namespace Kokkos
#endif /* #ifndef KOKKOS_IMPL_REDUCER_HPP */
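A brief host-side sketch of how the factory functions above might be called directly. This is illustrative only: it assumes Impl::ReduceSum (defined earlier in this header) joins by addition and that the code runs on the host.

double value = 0.0 ;
auto sum = Kokkos::Sum( value );          // rank-0 reducer referencing 'value'
double partial = 42.0 ;
sum.init( sum.data() );                   // reset 'value' to the identity (0.0)
sum.join( sum.data() , & partial );       // value += partial, assuming ReduceSum adds
// sum.reference() now refers to 42.0

double max_value = 0.0 ;
auto custom = Kokkos::reducer( max_value ,
    [] ( double & dst , double const & src ) { if ( dst < src ) dst = src ; } );
custom.join( custom.data() , & partial ); // no join member on the lambda, so operator() is used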

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -53,63 +53,126 @@
namespace Kokkos {
namespace Impl {
namespace SerialImpl {
namespace {
Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
HostThreadTeamData g_serial_thread_team_data ;
Sentinel::~Sentinel()
{
if ( m_scratch ) { free( m_scratch ); }
m_scratch = 0 ;
m_reduce_end = 0 ;
m_shared_end = 0 ;
}
Sentinel & Sentinel::singleton()
// Resize thread team data scratch memory
void serial_resize_thread_team_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes )
{
static Sentinel s ; return s ;
if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
// Allocate if any of the old allocations is too small:
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
( old_team_reduce < team_reduce_bytes ) ||
( old_team_shared < team_shared_bytes ) ||
( old_thread_local < thread_local_bytes );
if ( allocate ) {
Kokkos::HostSpace space ;
if ( old_alloc_bytes ) {
g_serial_thread_team_data.disband_team();
g_serial_thread_team_data.disband_pool();
space.deallocate( g_serial_thread_team_data.scratch_buffer()
, g_serial_thread_team_data.scratch_bytes() );
}
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
const size_t alloc_bytes =
HostThreadTeamData::scratch_size( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
void * const ptr = space.allocate( alloc_bytes );
g_serial_thread_team_data.
scratch_assign( ((char *)ptr)
, alloc_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
g_serial_thread_team_data.organize_pool( pool , 1 );
g_serial_thread_team_data.organize_team(1);
}
}
inline
unsigned align( unsigned n )
// Get the thread team data structure for the serial execution space
HostThreadTeamData * serial_get_thread_team_data()
{
enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
return ( n + MASK ) & ~MASK ;
return & g_serial_thread_team_data ;
}
} // namespace
SerialTeamMember::SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
)
: m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
, arg_shared_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
} // namespace Impl
} // namespace Kokkos
void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
/*--------------------------------------------------------------------------*/
namespace Kokkos {
int Serial::is_initialized()
{
static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
return 1 ;
}
reduce_size = Impl::SerialImpl::align( reduce_size );
shared_size = Impl::SerialImpl::align( shared_size );
void Serial::initialize( unsigned threads_count
, unsigned use_numa_count
, unsigned use_cores_per_numa
, bool allow_asynchronous_threadpool )
{
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
if ( ( s.m_reduce_end < reduce_size ) ||
( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
if ( s.m_scratch ) { free( s.m_scratch ); }
void Serial::finalize()
{
if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
Impl::g_serial_thread_team_data.disband_team();
Impl::g_serial_thread_team_data.disband_pool();
if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
Kokkos::HostSpace space ;
s.m_scratch = malloc( s.m_shared_end );
space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
, Impl::g_serial_thread_team_data.scratch_bytes() );
Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
}
return s.m_scratch ;
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
} // namespace Kokkos

View File

@ -62,11 +62,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until all queues are empty
while ( 0 < queue->m_ready_count ) {
@ -75,13 +77,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}
if ( end != task ) {
// pop_task resulted in lock == task->m_next
// pop_ready_task resulted in lock == task->m_next
// In the executing state
(*task->m_apply)( task , & exec );
@ -113,11 +115,13 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until no runnable task
@ -129,7 +133,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -65,6 +65,7 @@ public:
using memory_space = Kokkos::HostSpace ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const );
@ -72,237 +73,19 @@ public:
static
void execute( queue_type * const );
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::Serial
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
typename TaskType::function_type
get_function_pointer() { return TaskType::apply ; }
};
extern template class TaskQueue< Kokkos::Serial > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::Serial >
{
public:
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
};
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
//const TaskExec< Kokkos::Serial > & thread;
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread
( TaskExec< Kokkos::Serial > & arg_thread
, const iType& arg_start
, const iType & arg_end
)
: start( arg_start )
, end( arg_end)
, thread( arg_thread )
{}
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start( 0 )
, end(arg_count)
, thread(arg_thread)
{}
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
// OMP version needs non-const TaskExec
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >( thread, count );
}
// OMP version needs non-const TaskExec
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType1 & start, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >(
thread, iType(start), iType(end) );
}
// OMP version needs non-const TaskExec
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::Serial > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
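A hypothetical call site for the loop above, as it would appear inside a task's operator() that receives a TaskExec< Kokkos::Serial > team member; 'member', 'N' and 'results' are placeholders and not part of this header:

Kokkos::parallel_for( Kokkos::TeamThreadRange( member , N ) ,
                      [&] ( int i ) { results[i] = 2.0 * i ; } );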
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
initialized_result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
initialized_result+=tmp;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
initialized_result = result;
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */

View File

@ -1,693 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_HPP
#define KOKKOS_SYNCHRONIC_HPP
#include <impl/Kokkos_Synchronic_Config.hpp>
#include <atomic>
#include <chrono>
#include <thread>
#include <functional>
#include <algorithm>
namespace Kokkos {
namespace Impl {
enum notify_hint {
notify_all,
notify_one,
notify_none
};
enum expect_hint {
expect_urgent,
expect_delay
};
namespace Details {
template <class S, class T>
bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
int i = 0;
for(;i < __SYNCHRONIC_SPIN_RELAX(attempts); ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_relax();
for(;i < attempts; ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_yield();
return false;
}
struct __exponential_backoff {
__exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) {
}
static inline void sleep_for(std::chrono::microseconds const& time) {
auto t = time.count();
if(__builtin_expect(t > 75,0)) {
portable_sleep(time);
}
else if(__builtin_expect(t > 25,0))
__synchronic_yield();
else
__synchronic_relax();
}
void sleep_for_step() {
sleep_for(step());
}
std::chrono::microseconds step() {
float const f = ranfu();
int const t = int(microseconds * f);
if(__builtin_expect(f >= 0.95f,0))
microseconds = 8;
else
microseconds = (std::min)(microseconds>>1,maximum);
return std::chrono::microseconds(t);
}
private :
int maximum, microseconds, x, y, z;
int xorshf96() {
int t;
x ^= x << 16; x ^= x >> 5; x ^= x << 1;
t = x; x = y; y = z; z = t ^ x ^ y;
return z;
}
float ranfu() {
return (float)(xorshf96()&(~0UL>>1)) / (float)(~0UL>>1);
}
};
template <class T, class Enable = void>
struct __synchronic_base {
protected:
std::atomic<T> atom;
void notify(notify_hint = notify_all) noexcept {
}
void notify(notify_hint = notify_all) volatile noexcept {
}
public :
__synchronic_base() noexcept = default;
constexpr __synchronic_base(T v) noexcept : atom(v) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
};
#ifdef __SYNCHRONIC_COMPATIBLE
template <class T>
struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
public:
std::atomic<T> atom;
void notify(notify_hint hint = notify_all) noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all(&atom);
else
__synchronic_wake_one(&atom);
}
}
void notify(notify_hint hint = notify_all) volatile noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all_volatile(&atom);
else
__synchronic_wake_one_volatile(&atom);
}
}
public :
__synchronic_base() noexcept : count(0) { }
constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_volatile(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed_volatile(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
private:
mutable std::atomic<int> count;
};
#endif
template <class T, class Enable = void>
struct __synchronic : public __synchronic_base<T> {
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
};
template <class T>
struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T operator=(T v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator=(T v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator+=(T v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator+=(T v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator-=(T v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator-=(T v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator&=(T v) volatile noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator&=(T v) noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator|=(T v) volatile noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator|=(T v) noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator^=(T v) volatile noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
T operator^=(T v) noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
};
template <class T>
struct __synchronic<T*> : public __synchronic_base<T*> {
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T* operator=(T* v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator=(T* v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
};
} //namespace Details
template <class T>
struct synchronic : public Details::__synchronic<T> {
bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
this->atom.store(v,m);
this->notify(n);
}
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
this->atom.store(v,m);
this->notify(n);
}
T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
operator T() const volatile noexcept { return (T)this->atom; }
operator T() const noexcept { return (T)this->atom; }
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
synchronic() noexcept = default;
constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
synchronic(const synchronic&) = delete;
~synchronic() { }
synchronic& operator=(const synchronic&) = delete;
synchronic& operator=(const synchronic&) volatile = delete;
T operator=(T val) noexcept {
return Details::__synchronic<T>::operator=(val);
}
T operator=(T val) volatile noexcept {
return Details::__synchronic<T>::operator=(val);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
Details::__synchronic<T>::expect_update(nval,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
expect_update(nval,h);
return load(order);
}
template <class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
template < class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
};
#include <inttypes.h>
typedef synchronic<char> synchronic_char;
typedef synchronic<signed char> synchronic_schar;
typedef synchronic<unsigned char> synchronic_uchar;
typedef synchronic<short> synchronic_short;
typedef synchronic<unsigned short> synchronic_ushort;
typedef synchronic<int> synchronic_int;
typedef synchronic<unsigned int> synchronic_uint;
typedef synchronic<long> synchronic_long;
typedef synchronic<unsigned long> synchronic_ulong;
typedef synchronic<long long> synchronic_llong;
typedef synchronic<unsigned long long> synchronic_ullong;
//typedef synchronic<char16_t> synchronic_char16_t;
//typedef synchronic<char32_t> synchronic_char32_t;
typedef synchronic<wchar_t> synchronic_wchar_t;
typedef synchronic<int_least8_t> synchronic_int_least8_t;
typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
typedef synchronic<int_least16_t> synchronic_int_least16_t;
typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
typedef synchronic<int_least32_t> synchronic_int_least32_t;
typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
//typedef synchronic<int_least_64_t> synchronic_int_least_64_t;
typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
typedef synchronic<intptr_t> synchronic_intptr_t;
typedef synchronic<uintptr_t> synchronic_uintptr_t;
typedef synchronic<size_t> synchronic_size_t;
typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
typedef synchronic<intmax_t> synchronic_intmax_t;
typedef synchronic<uintmax_t> synchronic_uintmax_t;
}
}
#endif //KOKKOS_SYNCHRONIC_HPP

View File

@ -1,169 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
#define KOKKOS_SYNCHRONIC_CONFIG_H
#include <thread>
#include <chrono>
namespace Kokkos {
namespace Impl {
//the default yield function used inside the implementation is the Standard one
#define __synchronic_yield std::this_thread::yield
#define __synchronic_relax __synchronic_yield
#if defined(_MSC_VER)
//this is a handy GCC optimization that I use inside the implementation
#define __builtin_expect(condition,common) condition
#if _MSC_VER <= 1800
//using certain keywords that VC++ temporarily doesn't support
#define _ALLOW_KEYWORD_MACROS
#define noexcept
#define constexpr
#endif
//yes, I define multiple assignment operators
#pragma warning(disable:4522)
//I don't understand how Windows is so bad at timing functions, but is OK
//with straight-up yield loops
#define __do_backoff(b) __synchronic_yield()
#else
#define __do_backoff(b) b.sleep_for_step()
#endif
//certain platforms have efficient support for spin-waiting built into the operating system
#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
#if defined(_WIN32_WINNT)
#include <winsock2.h>
#include <Windows.h>
//the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
#define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
#define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
inline void native_sleep(unsigned long microseconds)
{
// What to do if microseconds is < 1000?
Sleep(microseconds / 1000);
}
inline void native_yield()
{
SwitchToThread();
}
#elif defined(__linux__)
#include <chrono>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <climits>
#include <cassert>
template < class Rep, class Period>
inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
struct timespec ts;
ts.tv_sec = static_cast<long>(std::chrono::duration_cast<std::chrono::seconds>(delta).count());
assert(!ts.tv_sec);
ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count());
return ts;
}
inline long futex(void const* addr1, int op, int val1) {
return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
}
inline long futex(void const* addr1, int op, int val1, struct timespec timeout) {
return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
}
inline void native_sleep(unsigned long microseconds)
{
usleep(microseconds);
}
inline void native_yield()
{
pthread_yield();
}
//the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
#define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
#define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
#define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
#define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
#define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
#define __synchronic_wait_volatile_timed(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
#define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
#define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
//the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
#undef __synchronic_yield
#define __synchronic_yield sched_yield
//for extremely short wait times, just let another hyper-thread run
#undef __synchronic_relax
#define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
#endif
#endif
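For reference, a minimal standalone illustration of the Linux futex wait/wake pair that the macros above wrap; this is a hypothetical example and not part of this header:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <climits>
#include <atomic>

static std::atomic<int> flag(0);

inline void wait_until_set()
{
  // Sleep in the kernel while 'flag' still holds the expected value 0.
  while ( flag.load( std::memory_order_acquire ) == 0 )
    syscall( SYS_futex , &flag , FUTEX_WAIT_PRIVATE , 0 , nullptr , nullptr , 0 );
}

inline void set_and_wake_all()
{
  flag.store( 1 , std::memory_order_release );
  syscall( SYS_futex , &flag , FUTEX_WAKE_PRIVATE , INT_MAX , nullptr , nullptr , 0 );
}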
#ifdef _GLIBCXX_USE_NANOSLEEP
inline void portable_sleep(std::chrono::microseconds const& time)
{ std::this_thread::sleep_for(time); }
#else
inline void portable_sleep(std::chrono::microseconds const& time)
{ native_sleep(time.count()); }
#endif
#ifdef _GLIBCXX_USE_SCHED_YIELD
inline void portable_yield()
{ std::this_thread::yield(); }
#else
inline void portable_yield()
{ native_yield(); }
#endif
//this is the number of times we initially spin, on the first wait attempt
#define __SYNCHRONIC_SPIN_COUNT_A 16
//this is how we decide to yield instead of just spinning, 'c' is the current trip count
//#define __SYNCHRONIC_SPIN_YIELD(c) true
#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
//this is the number of times we normally spin, on every subsequent wait attempt
#define __SYNCHRONIC_SPIN_COUNT_B 8
}
}
#endif //KOKKOS_SYNCHRONIC_CONFIG_H
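A standalone sketch of how these tuning knobs are typically consumed by the spin-wait in Kokkos_Synchronic.hpp: relax for the first fraction of the attempts, then yield between polls, and let the caller fall back to a real wait once the budget is exhausted (illustrative only, not part of this header):

#include <atomic>

inline bool spin_for_change( std::atomic<int> const & a , int expected_old )
{
  int i = 0 ;
  for ( ; i < __SYNCHRONIC_SPIN_RELAX(__SYNCHRONIC_SPIN_COUNT_A) ; ++i ) {
    if ( a.load( std::memory_order_relaxed ) != expected_old ) return true ;
    __synchronic_relax();   // cheapest wait: pause or hyper-thread yield
  }
  for ( ; i < __SYNCHRONIC_SPIN_COUNT_A ; ++i ) {
    if ( a.load( std::memory_order_relaxed ) != expected_old ) return true ;
    __synchronic_yield();   // give up the time slice between polls
  }
  return false ;            // caller escalates to futex wait or sleep
}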

View File

@ -1,162 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
#define KOKKOS_SYNCHRONIC_N3998_HPP
#include <impl/Kokkos_Synchronic.hpp>
#include <functional>
/*
In the section below, a synchronization point represents a point at which a
thread may block until a given synchronization condition has been reached or
at which it may notify other threads that a synchronization condition has
been achieved.
*/
namespace Kokkos { namespace Impl {
/*
A latch maintains an internal counter that is initialized when the latch
is created. The synchronization condition is reached when the counter is
decremented to 0. Threads may block at a synchronization point waiting
for the condition to be reached. When the condition is reached, any such
blocked threads will be released.
*/
struct latch {
latch(int val) : count(val), released(false) { }
latch(const latch&) = delete;
latch& operator=(const latch&) = delete;
~latch( ) { }
void arrive( ) {
__arrive( );
}
void arrive_and_wait( ) {
if(!__arrive( ))
wait( );
}
void wait( ) {
while(!released.load_when_not_equal(false,std::memory_order_acquire))
;
}
bool try_wait( ) {
return released.load(std::memory_order_acquire);
}
private:
bool __arrive( ) {
if(count.fetch_add(-1,std::memory_order_release)!=1)
return false;
released.store(true,std::memory_order_release);
return true;
}
std::atomic<int> count;
synchronic<bool> released;
};
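A hypothetical host-only use of the latch above: the main thread blocks until both workers have signalled arrival. Illustrative only; it assumes the synchronic machinery in this header builds for the target platform (std::thread is available through the includes above).

inline void latch_example()
{
  latch ready( 2 );                                  // two workers must arrive
  std::thread w1( [&ready]() { /* ...one-time setup... */ ready.arrive(); } );
  std::thread w2( [&ready]() { /* ...one-time setup... */ ready.arrive(); } );
  ready.wait();                                      // released once the counter reaches 0
  w1.join(); w2.join();
}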
/*
A barrier is created with an initial value representing the number of threads
that can arrive at the synchronization point. When that many threads have
arrived, the synchronization condition is reached and the threads are
released. The barrier will then reset, and may be reused for a new cycle, in
which the same set of threads may arrive again at the synchronization point.
The same set of threads shall arrive at the barrier in each cycle, otherwise
the behaviour is undefined.
*/
struct barrier {
barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { }
barrier(const barrier&) = delete;
barrier& operator=(const barrier&) = delete;
~barrier() { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
expected = nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
};
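A similar hypothetical sketch for the barrier above: both threads meet at the synchronization point once per step, so neither can begin step k+1 before the other has finished step k (illustrative only):

inline void barrier_example()
{
  barrier sync( 2 );
  auto work = [&sync]() {
    for ( int step = 0 ; step < 10 ; ++step ) {
      /* ...compute phase 'step'... */
      sync.arrive_and_wait();                        // all participants leave together
    }
  };
  std::thread a( work ), b( work );
  a.join(); b.join();
}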
/*
A notifying barrier behaves as a barrier, but is constructed with a callable
completion function that is invoked after all threads have arrived at the
synchronization point, and before the synchronization condition is reached.
The completion may modify the set of threads that arrives at the barrier in
each cycle.
*/
struct notifying_barrier {
template <typename T>
notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { }
notifying_barrier(const notifying_barrier&) = delete;
notifying_barrier& operator=(const notifying_barrier&) = delete;
~notifying_barrier( ) { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
int const newexpected = completion();
expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
std::function<int()> completion;
};
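And for the notifying_barrier: the completion function runs exactly once per cycle, after every thread has arrived and before any is released; returning 0 leaves the expected count unchanged (hypothetical example):

inline void notifying_barrier_example()
{
  int cycles = 0 ;
  notifying_barrier sync( 2 , [&cycles]() -> int { ++cycles ; return 0 ; } );
  auto work = [&sync]() { sync.arrive_and_wait(); sync.arrive_and_wait(); };
  std::thread a( work ), b( work );
  a.join(); b.join();                                // cycles == 2 afterwards
}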
}}
#endif //KOKKOS_SYNCHRONIC_N3998_HPP

View File

@ -76,9 +76,6 @@ namespace Impl {
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -149,8 +146,8 @@ private:
// task->m_next is the dependence or zero
// Postcondition:
// task->m_next is linked list membership
KOKKOS_FUNCTION
void schedule( task_root_type * const );
KOKKOS_FUNCTION void schedule_runnable( task_root_type * const );
KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
// Reschedule a task
// Precondition:
@ -178,7 +175,7 @@ private:
, task_root_type * const );
KOKKOS_FUNCTION
static task_root_type * pop_task( task_root_type * volatile * const );
static task_root_type * pop_ready_task( task_root_type * volatile * const );
KOKKOS_FUNCTION static
void decrement( task_root_type * task );
@ -368,6 +365,7 @@ public:
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -375,17 +373,43 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for a runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase() noexcept
: m_apply(0)
, m_queue(0)
, m_wait(0)
, m_next(0)
, m_ref_count(0)
, m_alloc_size(0)
, m_dep_count(0)
, m_task_type( TaskSingle )
, m_priority( 1 /* TaskRegularPriority */ )
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, TaskBase * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
) noexcept
: m_apply( arg_apply )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( arg_dependence )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( 0 )
, m_task_type( arg_task_type )
, m_priority( arg_priority )
{}
// Constructor for an aggregate task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( queue_type * arg_queue
, int arg_ref_count
, int arg_alloc_size
, int arg_dep_count
) noexcept
: m_apply( 0 )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( arg_dep_count )
, m_task_type( Aggregate )
, m_priority( 0 )
{}
//----------------------------------------
@ -406,9 +430,13 @@ public:
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange( & m_next, dep ) ) {
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
@ -431,8 +459,13 @@ class TaskBase< ExecSpace , ResultType , void >
{
private:
static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
using root_type = TaskBase<ExecSpace,void,void> ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
static_assert( sizeof(root_type) == 48 , "" );
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -444,9 +477,24 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase()
: TaskBase< ExecSpace , void , void >()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
)
: root_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, m_result()
{}
@ -471,11 +519,14 @@ private:
public:
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using member_type = TaskExec< ExecSpace > ;
using functor_type = FunctorType ;
using result_type = ResultType ;
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using specialization = TaskQueueSpecialization< ExecSpace > ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
using member_type = typename specialization::member_type ;
using functor_type = FunctorType ;
using result_type = ResultType ;
template< typename Type >
KOKKOS_INLINE_FUNCTION static
@ -522,13 +573,30 @@ public:
if ( 0 == member->team_rank() && !(task->requested_respawn()) ) {
// Did not respawn, destroy the functor to free memory.
static_cast<functor_type*>(task)->~functor_type();
// Cannot destroy the task until its dependences have been processed.
// Cannot destroy and deallocate the task until its dependences
// have been processed.
}
}
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase( functor_type const & arg_functor )
: base_type()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
, FunctorType && arg_functor
)
: base_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, functor_type( arg_functor )
{}

View File

@ -170,6 +170,7 @@ bool TaskQueue< ExecSpace >::push_task
)
{
// Push task into a concurrently pushed and popped queue.
// The queue can be either a ready task queue or a waiting task queue.
// The queue is a linked list where 'task->m_next' form the links.
// Fail the push attempt if the queue is locked;
// otherwise retry until the push succeeds.
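A generic standalone sketch of that push protocol (not Kokkos code): the head pointer doubles as a lock word, a push is refused while the head holds the lock sentinel, and the caller retries until the compare-and-swap succeeds.

#include <atomic>
#include <cstdint>

struct Node { Node * m_next ; };

static Node * const Lock = reinterpret_cast< Node * >( ~std::uintptr_t(0) );

inline bool try_push( std::atomic< Node * > & queue , Node * const task )
{
  Node * old_head = queue.load( std::memory_order_relaxed );
  if ( old_head == Lock ) return false ;             // queue is locked: fail this attempt
  task->m_next = old_head ;                          // link before publishing
  return queue.compare_exchange_weak( old_head , task ,
                                      std::memory_order_release ,
                                      std::memory_order_relaxed );
}
// Caller: while ( ! try_push( queue , task ) ) { /* retry */ }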
@ -227,13 +228,12 @@ bool TaskQueue< ExecSpace >::push_task
template< typename ExecSpace >
KOKKOS_FUNCTION
typename TaskQueue< ExecSpace >::task_root_type *
TaskQueue< ExecSpace >::pop_task
TaskQueue< ExecSpace >::pop_ready_task
( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
{
// Pop task from a concurrently pushed and popped queue.
// Pop task from a concurrently pushed and popped ready task queue.
// The queue is a linked list where 'task->m_next' form the links.
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
@ -252,85 +252,201 @@ TaskQueue< ExecSpace >::pop_task
// (1) lock, (2) end, or (3) a valid task.
// Thus zero will never appear in the queue.
//
// If queue is locked then just read by guaranteeing
// the CAS will fail.
// If queue is locked then just read by guaranteeing the CAS will fail.
if ( lock == task ) task = 0 ;
task_root_type * const x = task ;
task = Kokkos::atomic_compare_exchange(queue,task,lock);
task = Kokkos::atomic_compare_exchange(queue,x,lock);
if ( x == task ) break ; // CAS succeeded and queue is locked
}
if ( x == task ) {
// CAS succeeded and queue is locked
//
// This thread has locked the queue and removed 'task' from the queue.
// Extract the next entry of the queue from 'task->m_next'
// and mark 'task' as popped from a queue by setting
// 'task->m_next = lock'.
//
// Place the next entry in the head of the queue,
// which also unlocks the queue.
//
// This thread has exclusive access to
// the queue and the popped task's m_next.
if ( end != task ) {
*queue = task->m_next ; task->m_next = lock ;
// This thread has locked the queue and removed 'task' from the queue.
// Extract the next entry of the queue from 'task->m_next'
// and mark 'task' as popped from a queue by setting
// 'task->m_next = lock'.
Kokkos::memory_fence();
task_root_type * const next =
Kokkos::atomic_exchange( & task->m_next , lock );
#if 0
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, int(task->m_task_type)
, int(task->m_priority)
, int(task->m_ref_count) );
#endif
// Place the next entry in the head of the queue,
// which also unlocks the queue.
task_root_type * const unlock =
Kokkos::atomic_exchange( queue , next );
if ( next == zero || next == lock || lock != unlock ) {
Kokkos::abort("TaskQueue::pop_task ERROR");
return task ;
}
}
#if 0
if ( end != task ) {
printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, int(task->m_task_type)
, int(task->m_priority)
, int(task->m_ref_count) );
}
#endif
return task ;
return end ;
}
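// Illustrative summary (not part of this commit) of the pop protocol above,
// written as a linear sequence for one successful pop:
//
//   1. task = *queue ; if task == LockTag, treat it as 0 so the CAS must fail.
//   2. CAS( queue , task , LockTag )             // success: queue is now locked
//   3. next = atomic_exchange( & task->m_next , LockTag )
//                                                // mark 'task' as popped
//   4. atomic_exchange( queue , next )           // publish new head, unlock queue
//
// Steps 3 and 4 run while this thread holds the queue lock, so no other
// thread can observe the queue between the exchange of m_next and the
// restoration of the head.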
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule
void TaskQueue< ExecSpace >::schedule_runnable
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule a runnable or when_all task upon construction / spawn
// Schedule a runnable task upon construction / spawn
// and upon completion of other tasks that 'task' is waiting on.
// Precondition on runnable task state:
// task is either constructing or executing
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
// - if runnable then task is either constructing or respawning
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence
// Executing-respawn state:
// task->m_wait == head of linked list
// task->m_next == dependence
// task->m_next == dependence or 0
// Respawn state:
// task->m_wait == head of linked list: 'end' or valid task
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
// Executing-respawn -> Waiting
// Constructing -> Waiting
// Respawn -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list
// task->m_next == member of linked list
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
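// Illustrative summary (not part of this commit): the branches below decode
// the task state from m_wait alone.
//
//   task->m_wait == 0        -> Constructing      (first schedule)
//   task->m_wait == LockTag  -> Complete          (scheduling is an error)
//   task->m_wait == other    -> Executing-respawn (wait queue already exists)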
#if 0
printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, task->m_task_type
, task->m_priority
, task->m_ref_count );
#endif
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
bool respawn = false ;
//----------------------------------------
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock != task->m_wait ) {
// Task in Executing state with Respawn request
// - Update dependence
// - Transition to Waiting state
respawn = true ;
}
else {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
}
//----------------------------------------
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
// Exclusive access so don't need an atomic exchange
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
task_root_type * dep = task->m_next ; task->m_next = zero ;
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
if ( ( 0 != dep ) && respawn ) {
// Reference count for dep was incremented when
// respawn assigned dependency to task->m_next
// so that if dep completed prior to the
// above push_task dep would not be destroyed.
// dep reference count can now be decremented,
// which may deallocate the task.
TaskQueue::assign( & dep , (task_root_type *)0 );
}
if ( is_ready ) {
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const ready_queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
while ( ! push_task( ready_queue , task ) );
}
//----------------------------------------
// Postcondition:
// - A runnable 'task' was pushed into a wait or ready queue.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
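// Illustrative note (not part of this commit): the is_ready test above relies
// on the invariant that a task's m_wait is set to LockTag exactly when the
// task completes. A hypothetical helper making that explicit:
//
//   bool dep_is_complete_sketch( task_root_type * const dep )
//   { return ((task_root_type *) task_root_type::LockTag) == dep->m_wait ; }
//
// push_task( & dep->m_wait , task ) fails only when dep->m_wait holds
// LockTag, i.e. only when 'dep' is already complete, so a failed push means
// 'task' has no outstanding dependence and can go straight to a ready queue.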
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule_aggregate
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule an aggregate task upon construction
// and upon completion of other tasks that 'task' is waiting on.
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
@ -344,134 +460,85 @@ void TaskQueue< ExecSpace >::schedule
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
//----------------------------------------
{
// If Constructing then task->m_wait == 0
// Change to waiting by task->m_wait = EndTag
task_root_type * const init =
Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
// Precondition
if ( lock == init ) {
Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
}
// if ( init == 0 ) Constructing -> Waiting
// else Executing-Respawn -> Waiting
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock == task->m_wait ) {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
}
//----------------------------------------
// Scheduling a 'when_all' task with multiple dependences.
// This scheduling may be called when the 'when_all' is
// (1) created or
// (2) being removed from a completed task's wait list.
if ( task_root_type::Aggregate != task->m_task_type ) {
task_root_type ** const aggr = task->aggregate_dependences();
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
// Assume the 'when_all' is complete until a dependence is
// found that is not complete.
task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
bool is_complete = true ;
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
// Reference count for dep was incremented when assigned
// to task->m_next so that if it completed prior to the
// above push_task dep would not be destroyed.
// dep reference count can now be decremented,
// which may deallocate the task.
TaskQueue::assign( & dep , (task_root_type *)0 );
--i ;
if ( is_ready ) {
// Loop dependences looking for an incomplete task.
// Add this task to the incomplete task's wait queue.
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
// Remove a task 'x' from the dependence list.
// The reference count of 'x' was incremented when
// it was assigned into the dependence list.
Kokkos::atomic_increment( & m_ready_count );
// Exclusive access so don't need an atomic exchange
// task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
task_root_type * x = aggr[i] ; aggr[i] = zero ;
task_root_type * volatile * const queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
if ( x ) {
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If x->m_wait is not locked then push succeeds
// and the aggregate is not complete.
// If the push succeeds then this when_all 'task' may be
// processed by another thread at any time.
// For example, 'x' may be completed by another
// thread and then re-schedule this when_all 'task'.
while ( ! push_task( queue , task ) );
is_complete = ! push_task( & x->m_wait , task );
// Decrement reference count which had been incremented
// when 'x' was added to the dependence list.
TaskQueue::assign( & x , zero );
}
}
//----------------------------------------
else {
// Scheduling a 'when_all' task with multiple dependences.
// This scheduling may be called when the 'when_all' is
// (1) created or
// (2) being removed from a completed task's wait list.
task_root_type ** const aggr = task->aggregate_dependences();
if ( is_complete ) {
// The when_all 'task' was not added to a wait queue because
// all dependences were complete so this aggregate is complete.
// Complete the when_all 'task' to schedule other tasks
// that are waiting for the when_all 'task' to complete.
// Assume the 'when_all' is complete until a dependence is
// found that is not complete.
task->m_next = lock ;
bool is_complete = true ;
complete( task );
for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
--i ;
// Loop dependences looking for an incomplete task.
// Add this task to the incomplete task's wait queue.
// Remove a task 'x' from the dependence list.
// The reference count of 'x' was incremented when
// it was assigned into the dependence list.
task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
if ( x ) {
// If x->m_wait is not locked then push succeeds
// and the aggregate is not complete.
// If the push succeeds then this when_all 'task' may be
// processed by another thread at any time.
// For example, 'x' may be completed by another
// thread and then re-schedule this when_all 'task'.
is_complete = ! push_task( & x->m_wait , task );
// Decrement reference count which had been incremented
// when 'x' was added to the dependence list.
TaskQueue::assign( & x , zero );
}
}
if ( is_complete ) {
// The when_all 'task' was not added to a wait queue because
// all dependences were complete so this aggregate is complete.
// Complete the when_all 'task' to schedule other tasks
// that are waiting for the when_all 'task' to complete.
task->m_next = lock ;
complete( task );
// '*task' may have been deleted upon completion
}
// '*task' may have been deleted upon completion
}
//----------------------------------------
// Postcondition:
// A runnable 'task' was pushed into a wait or ready queue.
// An aggregate 'task' was either pushed to a wait queue
// or completed.
// Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
// - An aggregate 'task' was either pushed to a wait queue or completed.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
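// Worked example (not part of this commit): a when_all task with three
// dependences d2, d1, d0 stored in aggregate_dependences().
//
//   - schedule_aggregate scans i = 2, 1, 0. Suppose d2 is already complete:
//     the push into d2->m_wait fails, d2's slot is cleared, and the scan
//     continues with d1.
//   - Suppose d1 is still running: the push into d1->m_wait succeeds, the
//     scan stops, and the when_all now waits on d1.
//   - When d1 completes, its wait queue is drained and schedule_aggregate is
//     called again on the when_all; already-cleared slots are skipped and the
//     scan proceeds to d0.
//   - Once every slot has been cleared with all pushes failing, is_complete
//     remains true, m_next is set to LockTag, and complete() propagates to
//     any tasks waiting on the when_all itself.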
//----------------------------------------------------------------------------
@ -529,7 +596,7 @@ void TaskQueue< ExecSpace >::complete
// If a runnable task has finished executing and requested respawn,
// schedule the task for subsequent execution.
schedule( task );
schedule_runnable( task );
}
//----------------------------------------
else {
@ -556,18 +623,22 @@ void TaskQueue< ExecSpace >::complete
TaskQueue::assign( & task , zero );
// This thread has exclusive access to the wait list so
// the concurrency-safe pop_task function is not needed.
// the concurrency-safe pop_ready_task function is not needed.
// Schedule the tasks that have been waiting on the input 'task',
// which may have been deleted.
while ( x != end ) {
// Have exclusive access to 'x' until it is scheduled
// Set x->m_next = zero <= no dependence, not a respawn
// Set x->m_next = zero <= no dependence
task_root_type * const next = x->m_next ; x->m_next = 0 ;
task_root_type * const next =
(task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
schedule( x );
if ( task_root_type::Aggregate != x->m_task_type ) {
schedule_runnable( x );
}
else {
schedule_aggregate( x );
}
x = next ;
}

View File

@ -45,6 +45,7 @@
#define KOKKOS_CORE_IMPL_UTILITIES_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <type_traits>
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,52 +36,144 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>
/*--------------------------------------------------------------------------*/
#if ( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define YIELD asm volatile("nop")
#if !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define KOKKOS_INTERNAL_PAUSE
#else
/* Pause instruction to prevent excess processor bus usage */
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
#endif
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0u: KOKKOS_INTERNAL_NOP2; break;
case 1u: KOKKOS_INTERNAL_NOP4; break;
case 2u: KOKKOS_INTERNAL_NOP8; break;
case 3u: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
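// Worked example (not part of this commit): the backoff grows roughly
// logarithmically with the iteration count i, since the case label is
// bit_scan_reverse( (i >> 2) + 1 ):
//
//   i =  0 ..  3   -> (i>>2)+1 = 1        -> case 0  -> 2  nops
//   i =  4 .. 11   -> (i>>2)+1 = 2 .. 3   -> case 1  -> 4  nops
//   i = 12 .. 27   -> (i>>2)+1 = 4 .. 7   -> case 2  -> 8  nops
//   i = 28 .. 59   -> (i>>2)+1 = 8 .. 15  -> case 3  -> 16 nops
//   i >= 60        -> (i>>2)+1 >= 16      -> default -> 32 nops
//
// followed in every case by KOKKOS_INTERNAL_PAUSE.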
#else
/* Pause instruction to prevent excess processor bus usage */
#define YIELD asm volatile("pause\n":::"memory")
#include <sched.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
sched_yield();
}
}
#endif
#else // defined( _WIN32 )
#if defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
Sleep(0);
}
}
#elif defined( _MSC_VER )
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
YieldProcessor();
}
}
#else
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0: KOKKOS_INTERNAL_NOP2; break;
case 1: KOKKOS_INTERNAL_NOP4; break;
case 2: KOKKOS_INTERNAL_NOP8; break;
case 3: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#endif
#elif defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
#define YIELD Sleep(0)
#elif defined ( _WIN32) && defined (_MSC_VER)
/* Windows w/ Visual Studio */
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
#define YIELD YieldProcessor();
#elif defined ( _WIN32 )
/* Windows w/ Intel*/
#define YIELD __asm__ __volatile__("pause\n":::"memory")
#else
#include <sched.h>
#define YIELD sched_yield()
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
YIELD ;
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
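// Illustrative usage sketch (not part of this commit): a consumer thread
// blocks while a shared flag still holds its initial value and resumes once
// a producer publishes a new one.
//
//   volatile int32_t flag = 0 ;
//
//   // consumer thread:
//   Kokkos::Impl::spinwait_while_equal( flag , 0 );   // returns once flag != 0
//
//   // producer thread:
//   flag = 1 ;                                        // releases the consumer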
#endif
} /* namespace Impl */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,14 +47,30 @@
#include <Kokkos_Macros.hpp>
#include <cstdint>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value );
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
#else
KOKKOS_INLINE_FUNCTION
void spinwait( volatile int & , const int ) {}
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
#endif
} /* namespace Impl */