Update Kokkos library to v2.03.13
@@ -141,7 +141,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
#define LOOP_ARGS_8 LOOP_ARGS_7, i7 + m_offset[7]

// New Loop Macros...
// parallel_for, non-tagged
#define APPLY( func, ... ) \
@@ -1010,8 +1009,6 @@ namespace Kokkos { namespace Experimental { namespace Impl {
// end tagged macros

// Structs for calling loops
template < int Rank, bool IsLeft, typename IType, typename Tagged, typename Enable = void >
struct Tile_Loop_Type;
@@ -1279,6 +1276,19 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
template <typename T>
using is_void = std::is_same< T , void >;

template <typename T>
struct is_type_array : std::false_type
{
  using value_type = T;
};

template <typename T>
struct is_type_array< T[] > : std::true_type
{
  using value_type = T;
};

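For illustration only (not part of the commit), the new trait can be exercised like this; both asserts hold with the definitions above, assuming <type_traits> is in scope (the header already relies on std::false_type):

static_assert( !is_type_array< double   >::value, "scalar reduction value_type" );
static_assert(  is_type_array< double[] >::value, "array reduction value_type" );
static_assert( std::is_same< is_type_array< double[] >::value_type, double >::value,
               "value_type strips the [] and keeps the element type" );
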
template < typename RP
         , typename Functor
         , typename Tag = void
@@ -1761,18 +1771,17 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
  RP const& m_rp;
  Functor const& m_func;
  typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
  // value_type & m_v;

};

// ValueType: For reductions
// For ParallelReduce
// ValueType - scalar: For reductions
template < typename RP
         , typename Functor
         , typename Tag
         , typename ValueType
         >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
  using index_type = typename RP::index_type;
  using point_type = typename RP::point_type;
@@ -2251,12 +2260,497 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
};

// For ParallelReduce
// Extra specialization for array reductions
// ValueType[]: For array reductions
template < typename RP
         , typename Functor
         , typename Tag
         , typename ValueType
         >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
{
  using index_type = typename RP::index_type;
  using point_type = typename RP::point_type;

  using value_type = typename is_type_array<ValueType>::value_type; // strip away the 'array-ness' [], only underlying type remains

  inline
  HostIterateTile( RP const& rp, Functor const& func, value_type *v ) // v should be an array; treat as pointer for compatibility since size is not known nor needed here
    : m_rp(rp) //Cuda 7.0 does not like braces...
    , m_func(func)
    , m_v(v) // use with non-void ValueType struct
    {}

  inline
  bool check_iteration_bounds( point_type& partial_tile , point_type& offset ) const {
    bool is_full_tile = true;

    for ( int i = 0; i < RP::rank; ++i ) {
      if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) {
        partial_tile[i] = m_rp.m_tile[i] ;
      }
      else {
        is_full_tile = false ;
        partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
                        : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
                        : (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
      }
    }

    return is_full_tile ;
  } // end check bounds


  template <int Rank>
  struct RankTag
  {
    typedef RankTag type;
    enum { value = (int)Rank };
  };

#if KOKKOS_ENABLE_NEW_LOOP_MACROS
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
|
||||
|
||||
}
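// Illustration only, not from the Kokkos source: the mixed-radix decomposition
// above maps a single tile index onto per-dimension tile coordinates.  For a
// hypothetical rank-2 case with m_rp.m_tile_end = {3,4}, m_rp.m_tile = {8,16},
// m_rp.m_lower = {0,0} and tile_idx = 7, the Left direction gives
//   i = 0 :  m_offset[0] = (7 % 3) * 8  + 0 = 8  ,  tile_idx = 7 / 3 = 2
//   i = 1 :  m_offset[1] = (2 % 4) * 16 + 0 = 32 ,  tile_idx = 2 / 4 = 0
// so tile 7 starts at global coordinate (8,32); the Right branch walks the
// dimensions in the opposite order.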
|
||||
|
||||
#else
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{ operator_impl( tile_idx , RankTag<RP::rank>() ); }
|
||||
// added due to compiler error when using sfinae to choose operator based on rank
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<2> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_2L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_2L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_2R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_2R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_2 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 2
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<3> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_3L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_3L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_3R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_3R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_3 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 3
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<4> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_4L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_4L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_4R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_4R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_4 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 4
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<5> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_5L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_5L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_5R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_5R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_5 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 5
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<6> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_6L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_6L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_6R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_6R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_6 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 6
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<7> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_7L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_7L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_7R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_7R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_7 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 7
|
||||
|
||||
|
||||
template <typename IType>
|
||||
inline
|
||||
void operator_impl( IType tile_idx , const RankTag<8> ) const
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
else {
|
||||
for (int i=RP::rank-1; i>=0; --i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_8L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_8L(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
}
|
||||
} // end RP::Left
|
||||
else {
|
||||
if ( full_tile ) {
|
||||
// #pragma simd
|
||||
LOOP_8R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
} else {
|
||||
// #pragma simd
|
||||
LOOP_8R(index_type, m_tiledims) {
|
||||
apply( LOOP_ARGS_8 );
|
||||
}
|
||||
}
|
||||
} // end RP::Right
|
||||
|
||||
} //end op() rank == 8
|
||||
#endif
|
||||
|
||||
|
||||
template <typename... Args>
|
||||
typename std::enable_if<( sizeof...(Args) == RP::rank && std::is_same<Tag,void>::value), void>::type
|
||||
apply(Args &&... args) const
|
||||
{
|
||||
m_func(args... , m_v);
|
||||
}
|
||||
|
||||
template <typename... Args>
|
||||
typename std::enable_if<( sizeof...(Args) == RP::rank && !std::is_same<Tag,void>::value), void>::type
|
||||
apply(Args &&... args) const
|
||||
{
|
||||
m_func( m_tag, args... , m_v);
|
||||
}
|
||||
|
||||
|
||||
RP const& m_rp;
|
||||
Functor const& m_func;
|
||||
value_type * m_v;
|
||||
typename std::conditional< std::is_same<Tag,void>::value,int,Tag>::type m_tag;
|
||||
|
||||
};
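As a hedged illustration (not part of the Kokkos source), the kind of functor this ValueType[] specialization serves declares an array value_type plus a runtime value_count, and receives the reduction target as a pointer to value_count entries; the names below are invented for the example:

struct ColumnSums {
  typedef double value_type[];   // array reduction: one entry per column
  const unsigned value_count;    // number of reduction entries

  ColumnSums( unsigned ncols ) : value_count( ncols ) {}

  KOKKOS_INLINE_FUNCTION
  void operator()( const int i, const int j, value_type sums ) const {
    sums[j] += 1.0;              // rank-2 index plus the trailing array argument
  }
};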
|
||||
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
// MDFunctor - wraps the range_policy and functor to pass to IterateTile
|
||||
// Serial, Threads, OpenMP
|
||||
// Used for md_parallel_{for,reduce} with Serial, Threads, OpenMP
|
||||
// Cuda uses DeviceIterateTile directly within md_parallel_for
|
||||
// ParallelReduce
|
||||
// TODO Once md_parallel_{for,reduce} removed, this can be removed
|
||||
|
||||
// ParallelReduce - scalar reductions
|
||||
template < typename MDRange, typename Functor, typename ValueType = void >
|
||||
struct MDFunctor
|
||||
{
|
||||
@ -2273,7 +2767,7 @@ struct MDFunctor
|
||||
|
||||
|
||||
inline
|
||||
MDFunctor( MDRange const& range, Functor const& f, ValueType & v )
|
||||
MDFunctor( MDRange const& range, Functor const& f )
|
||||
: m_range( range )
|
||||
, m_func( f )
|
||||
{}
|
||||
@ -2290,7 +2784,6 @@ struct MDFunctor
|
||||
inline
|
||||
MDFunctor& operator=( MDFunctor && ) = default;
|
||||
|
||||
// KOKKOS_FORCEINLINE_FUNCTION //Caused cuda warning - __host__ warning
|
||||
inline
|
||||
void operator()(index_type t, value_type & v) const
|
||||
{
|
||||
@ -2301,6 +2794,56 @@ struct MDFunctor
|
||||
Functor m_func;
|
||||
};
|
||||

// ParallelReduce - array reductions
template < typename MDRange, typename Functor, typename ValueType >
struct MDFunctor< MDRange, Functor, ValueType[] >
{
  using range_policy = MDRange;
  using functor_type = Functor;
  using value_type = ValueType[];
  using work_tag = typename range_policy::work_tag;
  using index_type = typename range_policy::index_type;
  using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
                                                                           , Functor
                                                                           , work_tag
                                                                           , value_type
                                                                           >;

  inline
  MDFunctor( MDRange const& range, Functor const& f )
    : m_range( range )
    , m_func( f )
    , value_count( f.value_count )
    {}

  inline
  MDFunctor( MDFunctor const& ) = default;

  inline
  MDFunctor& operator=( MDFunctor const& ) = default;

  inline
  MDFunctor( MDFunctor && ) = default;

  inline
  MDFunctor& operator=( MDFunctor && ) = default;

  // FIXME Init and Join, as defined in m_func, are not working through the MDFunctor
  // Best path forward is to eliminate need for MDFunctor, directly use MDRangePolicy within Parallel{For,Reduce} ??
  inline
  void operator()(index_type t, value_type v) const
  {
    iterate_type(m_range, m_func, v)(t);
  }

  MDRange m_range;
  Functor m_func;
  size_t value_count;
};
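A hedged usage sketch (not from the commit) of how this specialization is reached through the experimental MDRange front end; the call follows the md_parallel_reduce interface named in the comments above, and the exact form of the result argument should be checked against this Kokkos version:

void column_sums_example( int nrows, int ncols, double* sums /* ncols entries */ )
{
  ColumnSums f( ncols );   // array-reduction functor sketched earlier
  Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> >
    policy( {{0,0}}, {{nrows,ncols}} );
  Kokkos::Experimental::md_parallel_reduce( policy, f, sums );
}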
|
||||
|
||||
|
||||
// ParallelFor
|
||||
template < typename MDRange, typename Functor >
|
||||
struct MDFunctor< MDRange, Functor, void >
|
||||
@ -2349,4 +2892,3 @@ struct MDFunctor< MDRange, Functor, void >
|
||||
} } } //end namespace Kokkos::Experimental::Impl
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -55,16 +55,19 @@ template < typename ExecutionSpace = void
|
||||
, typename WorkTag = void
|
||||
, typename IndexType = void
|
||||
, typename IterationPattern = void
|
||||
, typename LaunchBounds = void
|
||||
>
|
||||
struct PolicyTraitsBase
|
||||
{
|
||||
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
|
||||
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType,
|
||||
IterationPattern, LaunchBounds>;
|
||||
|
||||
using execution_space = ExecutionSpace;
|
||||
using schedule_type = Schedule;
|
||||
using work_tag = WorkTag;
|
||||
using index_type = IndexType;
|
||||
using iteration_pattern = IterationPattern;
|
||||
using launch_bounds = LaunchBounds;
|
||||
};
|
||||
|
||||
|
||||
@ -78,6 +81,7 @@ struct SetExecutionSpace
|
||||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
@ -91,6 +95,7 @@ struct SetSchedule
|
||||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
@ -104,6 +109,7 @@ struct SetWorkTag
|
||||
, WorkTag
|
||||
, typename PolicyBase::index_type
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
@ -117,6 +123,7 @@ struct SetIndexType
|
||||
, typename PolicyBase::work_tag
|
||||
, IndexType
|
||||
, typename PolicyBase::iteration_pattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
@ -131,6 +138,22 @@ struct SetIterationPattern
|
||||
, typename PolicyBase::work_tag
|
||||
, typename PolicyBase::index_type
|
||||
, IterationPattern
|
||||
, typename PolicyBase::launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
||||
template <typename PolicyBase, typename LaunchBounds>
struct SetLaunchBounds
{
  static_assert( is_void<typename PolicyBase::launch_bounds>::value
               , "Kokkos Error: More than one launch_bounds given" );
  using type = PolicyTraitsBase< typename PolicyBase::execution_space
                               , typename PolicyBase::schedule_type
                               , typename PolicyBase::work_tag
                               , typename PolicyBase::index_type
                               , typename PolicyBase::iteration_pattern
                               , LaunchBounds
                               >;
};
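Hedged illustration (not part of the commit): with launch_bounds threaded through the trait machinery, an execution policy can carry a CUDA launch-bounds hint alongside its other traits; the bound values below are arbitrary examples:

  using policy_t =
    Kokkos::RangePolicy< Kokkos::Cuda, Kokkos::LaunchBounds<256,4> >;

AnalyzePolicy below routes a LaunchBounds argument to SetLaunchBounds in the same way Schedule, IndexType, IterationPattern and the work tag are routed to their setters.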
|
||||
|
||||
@ -146,8 +169,9 @@ struct AnalyzePolicy<Base, T, Traits...> : public
|
||||
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
|
||||
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
|
||||
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
|
||||
, typename std::conditional< is_launch_bounds<T>::value , SetLaunchBounds<Base,T>
|
||||
, SetWorkTag<Base,T>
|
||||
>::type >::type >::type >::type>::type::type
|
||||
>::type >::type >::type >::type >::type>::type::type
|
||||
, Traits...
|
||||
>
|
||||
{};
|
||||
@ -178,11 +202,18 @@ struct AnalyzePolicy<Base>
|
||||
, void // TODO set default iteration pattern
|
||||
, typename Base::iteration_pattern
|
||||
>::type;
|
||||
|
||||
using launch_bounds = typename std::conditional< is_void< typename Base::launch_bounds >::value
|
||||
, LaunchBounds<>
|
||||
, typename Base::launch_bounds
|
||||
>::type;
|
||||
|
||||
using type = PolicyTraitsBase< execution_space
|
||||
, schedule_type
|
||||
, work_tag
|
||||
, index_type
|
||||
, iteration_pattern
|
||||
, launch_bounds
|
||||
>;
|
||||
};
|
||||
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
|
||||
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
|
||||
@ -126,11 +130,21 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare ,
|
||||
|
||||
inline
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{ return __sync_val_compare_and_swap(dest,compare,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_val_compare_and_swap(dest,compare,val);
}

inline
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
{
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
  _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
#endif
  return __sync_val_compare_and_swap(dest,compare,val);
}
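The same pattern recurs throughout these atomic backends: before the read-modify-write, prefetch the destination line with write intent (read-for-ownership) so it arrives in exclusive state.  A minimal stand-alone sketch, with a hypothetical function name and the commit's opt-in macro:

  #include <xmmintrin.h>   // _mm_prefetch / _MM_HINT_ET0

  inline int prefetched_fetch_add( volatile int * const dest , const int val )
  {
  #if defined( KOKKOS_ENABLE_RFO_PREFETCH )
    _mm_prefetch( (const char*) dest, _MM_HINT_ET0 );  // request the line for ownership
  #endif
    return __sync_fetch_and_add( dest, val );          // GCC/Clang builtin, as used here
  }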
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
@ -159,6 +173,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} tmp ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
|
||||
return tmp.t ;
|
||||
}
|
||||
@ -175,6 +193,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} tmp ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
|
||||
return tmp.t ;
|
||||
}
|
||||
@ -193,6 +215,10 @@ T atomic_compare_exchange( volatile T * const dest, const T & compare,
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} tmp ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
|
||||
return tmp.t ;
|
||||
}
|
||||
@ -209,6 +235,10 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
|
||||
#endif
|
||||
, const T >::type& val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
if( return_val == compare ) {
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_DECREMENT_HPP
|
||||
@ -54,6 +58,10 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<char>(volatile char* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock decb %0"
|
||||
: /* no output registers */
|
||||
@ -69,6 +77,10 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<short>(volatile short* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock decw %0"
|
||||
: /* no output registers */
|
||||
@ -84,6 +96,10 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<int>(volatile int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock decl %0"
|
||||
: /* no output registers */
|
||||
@ -99,6 +115,9 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_decrement<long long int>(volatile long long int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock decq %0"
|
||||
: /* no output registers */
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
|
||||
#define KOKKOS_ATOMIC_EXCHANGE_HPP
|
||||
@ -81,6 +85,10 @@ T atomic_exchange(
|
||||
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
|
||||
{
|
||||
// int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
|
||||
return *((T*)&tmp);
|
||||
}
|
||||
@ -93,6 +101,11 @@ T atomic_exchange(
|
||||
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
|
||||
{
|
||||
typedef unsigned long long int type ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
// type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
|
||||
type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
|
||||
return *((T*)&tmp);
|
||||
@ -108,6 +121,10 @@ T atomic_exchange( volatile T * const dest ,
|
||||
{
|
||||
T return_val;
|
||||
// This is a way to (hopefully) avoid dead lock in a warp
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
int done = 0;
|
||||
unsigned int active = __ballot(1);
|
||||
unsigned int done_active = 0;
|
||||
@ -173,6 +190,9 @@ T atomic_exchange( volatile T * const dest ,
|
||||
, const T & >::type val )
|
||||
{
|
||||
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
const type v = *((type*)&val); // Extract to be sure the value doesn't change
|
||||
|
||||
@ -201,6 +221,10 @@ T atomic_exchange( volatile T * const dest ,
|
||||
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
|
||||
, const T & >::type val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
union U {
|
||||
Impl::cas128_t i ;
|
||||
T t ;
|
||||
@ -260,6 +284,10 @@ void atomic_assign( volatile T * const dest ,
|
||||
{
|
||||
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
const type v = *((type*)&val); // Extract to be sure the value doesn't change
|
||||
|
||||
type assumed ;
|
||||
@ -285,6 +313,10 @@ void atomic_assign( volatile T * const dest ,
|
||||
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
|
||||
, const T & >::type val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
union U {
|
||||
Impl::cas128_t i ;
|
||||
T t ;
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
|
||||
@ -161,36 +165,60 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
inline
|
||||
int atomic_fetch_add( volatile int * dest , const int val )
|
||||
{
|
||||
int original = val;
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock xadd %1, %0"
|
||||
: "+m" (*dest), "+r" (original)
|
||||
: "m" (*dest), "r" (original)
|
||||
: "memory"
|
||||
int original = val;
|
||||
|
||||
__asm__ __volatile__(
|
||||
"lock xadd %1, %0"
|
||||
: "+m" (*dest), "+r" (original)
|
||||
: "m" (*dest), "r" (original)
|
||||
: "memory"
|
||||
);
|
||||
|
||||
return original;
|
||||
return original;
|
||||
}
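// Illustration only (not from the source): "lock xadd %1, %0" atomically exchanges
// and adds, so 'original' comes back holding the previous *dest while *dest becomes
// the sum.  E.g. with *dest == 5 and val == 2 the call returns 5 and leaves *dest == 7,
// which is exactly the fetch-and-add contract of the __sync builtin fallback below.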
|
||||
#else
|
||||
inline
|
||||
int atomic_fetch_add( volatile int * const dest , const int val )
|
||||
{ return __sync_fetch_and_add(dest, val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_add(dest, val);
|
||||
}
|
||||
#endif
|
||||
|
||||
inline
|
||||
long int atomic_fetch_add( volatile long int * const dest , const long int val )
|
||||
{ return __sync_fetch_and_add(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_add(dest,val);
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
inline
|
||||
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
|
||||
{ return __sync_fetch_and_add(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_add(dest,val);
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
|
||||
{ return __sync_fetch_and_add(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_add(dest,val);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -205,6 +233,10 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
inline U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
@ -228,6 +260,10 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
inline U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
@ -253,6 +289,10 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
inline U() {};
|
||||
} assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_AND_HPP
|
||||
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_and( volatile unsigned long long int * const
|
||||
|
||||
inline
|
||||
int atomic_fetch_and( volatile int * const dest , const int val )
|
||||
{ return __sync_fetch_and_and(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_and(dest,val);
|
||||
}
|
||||
|
||||
inline
|
||||
long int atomic_fetch_and( volatile long int * const dest , const long int val )
|
||||
{ return __sync_fetch_and_and(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_and(dest,val);
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
inline
|
||||
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
|
||||
{ return __sync_fetch_and_and(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_and(dest,val);
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
|
||||
{ return __sync_fetch_and_and(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_and(dest,val);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_OR_HPP
|
||||
@ -76,21 +80,41 @@ unsigned long long int atomic_fetch_or( volatile unsigned long long int * const
|
||||
|
||||
inline
|
||||
int atomic_fetch_or( volatile int * const dest , const int val )
|
||||
{ return __sync_fetch_and_or(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_or(dest,val);
|
||||
}
|
||||
|
||||
inline
|
||||
long int atomic_fetch_or( volatile long int * const dest , const long int val )
|
||||
{ return __sync_fetch_and_or(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_or(dest,val);
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
inline
|
||||
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
|
||||
{ return __sync_fetch_and_or(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_or(dest,val);
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
|
||||
{ return __sync_fetch_and_or(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_or(dest,val);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
|
||||
@ -136,21 +140,41 @@ T atomic_fetch_sub( volatile T * const dest ,
|
||||
|
||||
inline
|
||||
int atomic_fetch_sub( volatile int * const dest , const int val )
|
||||
{ return __sync_fetch_and_sub(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_sub(dest,val);
|
||||
}
|
||||
|
||||
inline
|
||||
long int atomic_fetch_sub( volatile long int * const dest , const long int val )
|
||||
{ return __sync_fetch_and_sub(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_sub(dest,val);
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_GNU_ATOMICS )
|
||||
|
||||
inline
|
||||
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
|
||||
{ return __sync_fetch_and_sub(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_sub(dest,val);
|
||||
}
|
||||
|
||||
inline
|
||||
unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
|
||||
{ return __sync_fetch_and_sub(dest,val); }
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
return __sync_fetch_and_sub(dest,val);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -161,6 +185,10 @@ T atomic_fetch_sub( volatile T * const dest ,
|
||||
{
|
||||
union { int i ; T t ; } assume , oldval , newval ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
oldval.t = *dest ;
|
||||
|
||||
do {
|
||||
@ -178,6 +206,10 @@ T atomic_fetch_sub( volatile T * const dest ,
|
||||
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
|
||||
sizeof(T) == sizeof(long) , const T >::type val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
union { long i ; T t ; } assume , oldval , newval ;
|
||||
|
||||
oldval.t = *dest ;
|
||||
@ -202,6 +234,10 @@ T atomic_fetch_sub( volatile T * const dest ,
|
||||
&& ( sizeof(T) != 8 )
|
||||
, const T >::type& val )
|
||||
{
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) dest, _MM_HINT_ET0 );
|
||||
#endif
|
||||
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
*dest = return_val - val;
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_INCREMENT_HPP
|
||||
@ -52,6 +56,9 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<char>(volatile char* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incb %0"
|
||||
: /* no output registers */
|
||||
@ -67,6 +74,9 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<short>(volatile short* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incw %0"
|
||||
: /* no output registers */
|
||||
@ -82,6 +92,9 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<int>(volatile int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incl %0"
|
||||
: /* no output registers */
|
||||
@ -97,6 +110,9 @@ template<>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void atomic_increment<long long int>(volatile long long int* a) {
|
||||
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) && ! defined(_WIN32) && ! defined(__CUDA_ARCH__)
|
||||
#if defined( KOKKOS_ENABLE_RFO_PREFETCH )
|
||||
_mm_prefetch( (const char*) a, _MM_HINT_ET0 );
|
||||
#endif
|
||||
__asm__ __volatile__(
|
||||
"lock incq %0"
|
||||
: /* no output registers */
|
||||
|
||||
@ -87,17 +87,12 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
|
||||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
|
||||
if(num_threads>0) {
|
||||
if(use_numa>0) {
|
||||
Kokkos::OpenMP::initialize(num_threads,use_numa);
|
||||
}
|
||||
else {
|
||||
Kokkos::OpenMP::initialize(num_threads);
|
||||
}
|
||||
} else {
|
||||
Kokkos::OpenMP::initialize();
|
||||
if(use_numa>0) {
|
||||
Kokkos::OpenMP::initialize(num_threads,use_numa);
|
||||
}
|
||||
else {
|
||||
Kokkos::OpenMP::initialize(num_threads);
|
||||
}
|
||||
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
|
||||
}
|
||||
else {
|
||||
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
|
||||
@ -437,10 +432,7 @@ void initialize(int& narg, char* arg[])
|
||||
iarg++;
|
||||
}
|
||||
|
||||
InitArguments arguments;
arguments.num_threads = num_threads;
arguments.num_numa = numa;
arguments.device_id = device;
InitArguments arguments{num_threads, numa, device};
Impl::initialize_internal(arguments);
}
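Hedged aside (not part of the diff): the brace form above relies on the member order num_threads, num_numa, device_id of Kokkos::InitArguments, matching the three assignments it replaces; user code can pass the same aggregate straight to Kokkos::initialize, e.g.

  Kokkos::InitArguments args{ 8, 1, 0 };   // 8 threads, 1 NUMA region, device 0
  Kokkos::initialize( args );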
|
||||
|
||||
|
||||
@ -170,28 +170,31 @@ struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType:
|
||||
static_assert( 0 == ( sizeof(value_type) % sizeof(int) ) ,
|
||||
"Reduction functor's declared value_type requires: 0 == sizeof(value_type) % sizeof(int)" );
|
||||
|
||||
/* this cast to bool is needed for correctness by NVCC */
enum : bool { IsArray = static_cast<bool>(Impl::is_array< typename FunctorType::value_type >::value) };

// If not an array then what is the sizeof(value_type)
enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) };
enum { StaticValueSize = IsArray ? 0 : sizeof(value_type) };

typedef value_type * pointer_type ;

// The reference_type for an array is 'value_type *'
// The reference_type for a single value is 'value_type &'

typedef typename Impl::if_c< ! StaticValueSize , value_type *
                           , value_type & >::type reference_type ;
typedef typename Impl::if_c< IsArray , value_type *
                           , value_type & >::type reference_type ;

// Number of values if single value
|
||||
template< class F >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! IsArray , unsigned >::type
|
||||
value_count( const F & ) { return 1 ; }
|
||||
|
||||
// Number of values if an array, protect via templating because 'f.value_count'
|
||||
// will only exist when the functor declares the value_type to be an array.
|
||||
template< class F >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type
|
||||
typename Impl::enable_if< std::is_same<F,FunctorType>::value && IsArray , unsigned >::type
|
||||
value_count( const F & f ) { return f.value_count ; }
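// Illustration only (not from the source): with the IsArray enum the two overloads
// above resolve cleanly under NVCC.  A functor declaring
//   typedef double value_type;                          -> value_count( f ) == 1
// while an array-reduction functor declaring
//   typedef double value_type[];  unsigned value_count; -> value_count( f ) == f.value_count
// which is the quantity the array MDFunctor specialization caches.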
|
||||
|
||||
// Total size of the value
|
||||
|
||||
@ -70,62 +70,6 @@
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace {
|
||||
|
||||
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
|
||||
|
||||
typedef int (* QuerySpaceInParallelPtr )();
|
||||
|
||||
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
|
||||
int s_in_parallel_query_count = 0 ;
|
||||
|
||||
} // namespace <empty>
|
||||
|
||||
void HBWSpace::register_in_parallel( int (*device_in_parallel)() )
|
||||
{
|
||||
if ( 0 == device_in_parallel ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel ERROR : given NULL" ) );
|
||||
}
|
||||
|
||||
int i = -1 ;
|
||||
|
||||
if ( ! (device_in_parallel)() ) {
|
||||
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
|
||||
}
|
||||
|
||||
if ( i < s_in_parallel_query_count ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : called in_parallel" ) );
|
||||
|
||||
}
|
||||
|
||||
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::HBWSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
|
||||
|
||||
}
|
||||
|
||||
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
|
||||
|
||||
if ( i == s_in_parallel_query_count ) {
|
||||
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
|
||||
}
|
||||
}
|
||||
|
||||
int HBWSpace::in_parallel()
|
||||
{
|
||||
const int n = s_in_parallel_query_count ;
|
||||
|
||||
int i = 0 ;
|
||||
|
||||
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
|
||||
|
||||
return i < n ;
|
||||
}
|
||||
|
||||
} // namespace Experiemtal
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
@ -106,62 +106,6 @@
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace {
|
||||
|
||||
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
|
||||
|
||||
typedef int (* QuerySpaceInParallelPtr )();
|
||||
|
||||
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
|
||||
int s_in_parallel_query_count = 0 ;
|
||||
|
||||
} // namespace <empty>
|
||||
|
||||
void HostSpace::register_in_parallel( int (*device_in_parallel)() )
|
||||
{
|
||||
if ( 0 == device_in_parallel ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
|
||||
}
|
||||
|
||||
int i = -1 ;
|
||||
|
||||
if ( ! (device_in_parallel)() ) {
|
||||
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
|
||||
}
|
||||
|
||||
if ( i < s_in_parallel_query_count ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
|
||||
|
||||
}
|
||||
|
||||
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
|
||||
|
||||
}
|
||||
|
||||
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
|
||||
|
||||
if ( i == s_in_parallel_query_count ) {
|
||||
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
|
||||
}
|
||||
}
|
||||
|
||||
int HostSpace::in_parallel()
|
||||
{
|
||||
const int n = s_in_parallel_query_count ;
|
||||
|
||||
int i = 0 ;
|
||||
|
||||
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
|
||||
|
||||
return i < n ;
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/* Default allocation mechanism */
|
||||
@ -340,9 +284,6 @@ void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_
|
||||
}
|
||||
}
|
||||
|
||||
constexpr const char* HostSpace::name() {
|
||||
return m_name;
|
||||
}
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -45,7 +45,7 @@
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_Spinwait.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -58,9 +58,11 @@ void HostThreadTeamData::organize_pool
|
||||
{
|
||||
bool ok = true ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
// Verify not already a member of a pool:
|
||||
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
|
||||
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
|
||||
ok = ( nullptr != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
|
||||
}
|
||||
|
||||
if ( ok ) {
|
||||
@ -89,7 +91,6 @@ void HostThreadTeamData::organize_pool
|
||||
mem->m_team_alloc = 1 ;
|
||||
mem->m_league_rank = rank ;
|
||||
mem->m_league_size = size ;
|
||||
mem->m_pool_rendezvous_step = 0 ;
|
||||
mem->m_team_rendezvous_step = 0 ;
|
||||
pool[ rank ] = mem ;
|
||||
}
|
||||
@ -116,7 +117,6 @@ void HostThreadTeamData::disband_pool()
|
||||
m_team_alloc = 1 ;
|
||||
m_league_rank = 0 ;
|
||||
m_league_size = 1 ;
|
||||
m_pool_rendezvous_step = 0 ;
|
||||
m_team_rendezvous_step = 0 ;
|
||||
}
|
||||
|
||||
@ -256,11 +256,6 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
|
||||
|
||||
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
|
||||
|
||||
union {
|
||||
int64_t full ;
|
||||
int8_t byte[8] ;
|
||||
} value ;
|
||||
|
||||
if ( rank ) {
|
||||
|
||||
const int group_begin = rank << shift_byte ; // == rank * size_byte
|
||||
@ -275,13 +270,14 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer
|
||||
const int end = group_begin + size_byte < size
|
||||
? size_byte : size - group_begin ;
|
||||
|
||||
value.full = 0 ;
|
||||
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
|
||||
int64_t value = 0 ;
|
||||
|
||||
store_fence(); // This should not be needed but fixes #742
|
||||
for ( int i = 0 ; i < end ; ++i ) {
|
||||
((int8_t*) & value )[i] = int8_t( step );
|
||||
}
|
||||
|
||||
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
|
||||
, value.full );
|
||||
, value );
|
||||
}
|
||||
|
||||
{
|
||||
@@ -316,10 +312,12 @@ int HostThreadTeamData::rendezvous( int64_t * const buffer

const int end = size_byte < size ? 8 : size ;

value.full = 0 ;
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
int64_t value = 0 ;
for ( int i = 1 ; i < end ; ++i ) {
  ((int8_t *) & value)[i] = int8_t( step );
}

spinwait_until_equal( buffer[ sync_offset ], value.full );
spinwait_until_equal( buffer[ sync_offset ], value );
}
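// Worked illustration (not part of the commit): the byte-stamping replaces the old
// union { int64_t full; int8_t byte[8]; }.  With step == 3 and end == 5 the loop
// writes bytes 1..4, so on a little-endian host value == 0x0000000303030300; the
// root thread then spins until every participating thread has stamped its byte of
// buffer[ sync_offset ] with the current step and the whole 64-bit word matches.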
|
||||
|
||||
return rank ? 0 : 1 ;
|
||||
|
||||
@ -50,6 +50,7 @@
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_Rendezvous.hpp>

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -67,14 +68,12 @@ public:

// Assume upper bounds on number of threads:
// pool size <= 1024 threads
// pool rendezvous <= ( 1024 / 8 ) * 4 + 4 = 2052
// team size <= 64 threads
// team rendezvous <= ( 64 / 8 ) * 4 + 4 = 36

enum : int { max_pool_members = 1024 };
enum : int { max_team_members = 64 };
enum : int { max_pool_rendezvous = ( max_pool_members / 8 ) * 4 + 4 };
enum : int { max_team_rendezvous = ( max_team_members / 8 ) * 4 + 4 };
enum : int { max_pool_rendezvous = rendezvous_buffer_size( max_pool_members ) };
enum : int { max_team_rendezvous = rendezvous_buffer_size( max_team_members ) };

private:

@ -114,7 +113,6 @@ private:
int m_league_size ;
int m_work_chunk ;
int m_steal_rank ; // work stealing rank
int mutable m_pool_rendezvous_step ;
int mutable m_team_rendezvous_step ;

HostThreadTeamData * team_member( int r ) const noexcept
@ -147,6 +145,7 @@ public:
int team_rendezvous( int const root ) const noexcept
{
return 1 == m_team_size ? 1 :
HostThreadTeamData::
rendezvous( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step
, m_team_size
@ -157,6 +156,7 @@ public:
int team_rendezvous() const noexcept
{
return 1 == m_team_size ? 1 :
HostThreadTeamData::
rendezvous( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step
, m_team_size
@ -167,6 +167,7 @@ public:
void team_rendezvous_release() const noexcept
{
if ( 1 < m_team_size ) {
HostThreadTeamData::
rendezvous_release( m_team_scratch + m_team_rendezvous
, m_team_rendezvous_step );
}
@ -175,19 +176,30 @@ public:
inline
int pool_rendezvous() const noexcept
{
static constexpr int yield_wait =
#if defined( KOKKOS_COMPILER_IBM )
// If running on IBM POWER architecture the global
// level rendzvous should immediately yield when
// waiting for other threads in the pool to arrive.
1
#else
0
#endif
;
return 1 == m_pool_size ? 1 :
Kokkos::Impl::
rendezvous( m_pool_scratch + m_pool_rendezvous
, m_pool_rendezvous_step
, m_pool_size
, m_pool_rank );
, m_pool_rank
, yield_wait );
}

inline
void pool_rendezvous_release() const noexcept
{
if ( 1 < m_pool_size ) {
rendezvous_release( m_pool_scratch + m_pool_rendezvous
, m_pool_rendezvous_step );
Kokkos::Impl::
rendezvous_release( m_pool_scratch + m_pool_rendezvous );
}
}

@ -213,7 +225,6 @@ public:
, m_league_size(1)
, m_work_chunk(0)
, m_steal_rank(0)
, m_pool_rendezvous_step(0)
, m_team_rendezvous_step(0)
{}

@ -406,7 +417,7 @@ fflush(stdout);
// Steal from next team, round robin
// The next team is offset by m_team_alloc if it fits in the pool.

m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ?
m_steal_rank = m_team_base + m_team_alloc + m_team_size <= m_pool_size ?
m_team_base + m_team_alloc : 0 ;
}
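
For the bounds in the comment block above, the buffer sizes implied by the new rendezvous_buffer_size helper (declared in impl/Kokkos_Rendezvous.hpp later in this commit) work out as follows. The sketch below simply mirrors that formula and is not part of the commit.

// --- illustrative sketch, not part of the commit ---
#include <cstdio>

// Mirror of the formula introduced in impl/Kokkos_Rendezvous.hpp:
//   (((max_members + 7) / 8) * 4) + 4 + 4
constexpr int rendezvous_buffer_size_sketch( int max_members ) noexcept
{
  return (((max_members + 7) / 8) * 4) + 4 + 4;
}

int main()
{
  // One int64_t[4] span per group of eight members, plus two extra
  // int64_t[4]-sized spans; the .cpp introduced later in this commit
  // uses those for the step counter and the release flags.
  std::printf( "pool (1024 members): %d int64_t entries\n",
               rendezvous_buffer_size_sketch(1024) );   // 520
  std::printf( "team (  64 members): %d int64_t entries\n",
               rendezvous_buffer_size_sketch(64) );     // 40
  return 0;
}
// --- end sketch ---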
@ -50,51 +50,70 @@
|
||||
namespace Kokkos {
|
||||
namespace Profiling {
|
||||
|
||||
static initFunction initProfileLibrary = nullptr;
|
||||
static finalizeFunction finalizeProfileLibrary = nullptr;
|
||||
|
||||
static beginFunction beginForCallee = nullptr;
|
||||
static beginFunction beginScanCallee = nullptr;
|
||||
static beginFunction beginReduceCallee = nullptr;
|
||||
static endFunction endForCallee = nullptr;
|
||||
static endFunction endScanCallee = nullptr;
|
||||
static endFunction endReduceCallee = nullptr;
|
||||
|
||||
static pushFunction pushRegionCallee = nullptr;
|
||||
static popFunction popRegionCallee = nullptr;
|
||||
|
||||
static allocateDataFunction allocateDataCallee = nullptr;
|
||||
static deallocateDataFunction deallocateDataCallee = nullptr;
|
||||
|
||||
static beginDeepCopyFunction beginDeepCopyCallee = nullptr;
|
||||
static endDeepCopyFunction endDeepCopyCallee = nullptr;
|
||||
|
||||
SpaceHandle::SpaceHandle(const char* space_name) {
|
||||
strncpy(name,space_name,64);
|
||||
}
|
||||
|
||||
bool profileLibraryLoaded() {
|
||||
return (NULL != initProfileLibrary);
|
||||
return (nullptr != initProfileLibrary);
|
||||
}
|
||||
|
||||
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginForCallee) {
|
||||
if(nullptr != beginForCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelFor(const uint64_t kernelID) {
|
||||
if(NULL != endForCallee) {
|
||||
if(nullptr != endForCallee) {
|
||||
Kokkos::fence();
|
||||
(*endForCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginScanCallee) {
|
||||
if(nullptr != beginScanCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelScan(const uint64_t kernelID) {
|
||||
if(NULL != endScanCallee) {
|
||||
if(nullptr != endScanCallee) {
|
||||
Kokkos::fence();
|
||||
(*endScanCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginReduceCallee) {
|
||||
if(nullptr != beginReduceCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelReduce(const uint64_t kernelID) {
|
||||
if(NULL != endReduceCallee) {
|
||||
if(nullptr != endReduceCallee) {
|
||||
Kokkos::fence();
|
||||
(*endReduceCallee)(kernelID);
|
||||
}
|
||||
@ -102,31 +121,47 @@ void endParallelReduce(const uint64_t kernelID) {
|
||||
|
||||
|
||||
void pushRegion(const std::string& kName) {
|
||||
if( NULL != pushRegionCallee ) {
|
||||
if( nullptr != pushRegionCallee ) {
|
||||
Kokkos::fence();
|
||||
(*pushRegionCallee)(kName.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void popRegion() {
|
||||
if( NULL != popRegionCallee ) {
|
||||
if( nullptr != popRegionCallee ) {
|
||||
Kokkos::fence();
|
||||
(*popRegionCallee)();
|
||||
}
|
||||
}
|
||||
|
||||
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
|
||||
if(NULL != allocateDataCallee) {
|
||||
if(nullptr != allocateDataCallee) {
|
||||
(*allocateDataCallee)(space,label.c_str(),ptr,size);
|
||||
}
|
||||
}
|
||||
|
||||
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
|
||||
if(NULL != allocateDataCallee) {
|
||||
if(nullptr != deallocateDataCallee) {
|
||||
(*deallocateDataCallee)(space,label.c_str(),ptr,size);
|
||||
}
|
||||
}
|
||||
|
||||
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
|
||||
const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
|
||||
const uint64_t size) {
|
||||
if(nullptr != beginDeepCopyCallee) {
|
||||
(*beginDeepCopyCallee)(dst_space, dst_label.c_str(), dst_ptr,
|
||||
src_space, src_label.c_str(), src_ptr,
|
||||
size);
|
||||
}
|
||||
}
|
||||
|
||||
void endDeepCopy() {
|
||||
if(nullptr != endDeepCopyCallee) {
|
||||
(*endDeepCopyCallee)();
|
||||
}
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
|
||||
// Make sure initialize calls happens only once
|
||||
@ -140,7 +175,7 @@ void initialize() {
|
||||
|
||||
// If we do not find a profiling library in the environment then exit
|
||||
// early.
|
||||
if( NULL == envProfileLibrary ) {
|
||||
if( nullptr == envProfileLibrary ) {
|
||||
return ;
|
||||
}
|
||||
|
||||
@ -149,10 +184,10 @@ void initialize() {
|
||||
|
||||
char* profileLibraryName = strtok(envProfileCopy, ";");
|
||||
|
||||
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
|
||||
if( (nullptr != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
|
||||
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
|
||||
|
||||
if(NULL == firstProfileLibrary) {
|
||||
if(nullptr == firstProfileLibrary) {
|
||||
std::cerr << "Error: Unable to load KokkosP library: " <<
|
||||
profileLibraryName << std::endl;
|
||||
} else {
|
||||
@ -191,14 +226,19 @@ void initialize() {
|
||||
auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
|
||||
deallocateDataCallee = *((deallocateDataFunction*) &p12);
|
||||
|
||||
auto p13 = dlsym(firstProfileLibrary, "kokkosp_begin_deep_copy");
|
||||
beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13);
|
||||
auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy");
|
||||
endDeepCopyCallee = *((endDeepCopyFunction*) &p14);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if(NULL != initProfileLibrary) {
|
||||
if(nullptr != initProfileLibrary) {
|
||||
(*initProfileLibrary)(0,
|
||||
(uint64_t) KOKKOSP_INTERFACE_VERSION,
|
||||
(uint32_t) 0,
|
||||
NULL);
|
||||
nullptr);
|
||||
}
|
||||
|
||||
free(envProfileCopy);
|
||||
@ -210,28 +250,30 @@ void finalize() {
|
||||
if(is_finalized) return;
|
||||
is_finalized = 1;
|
||||
|
||||
if(NULL != finalizeProfileLibrary) {
|
||||
if(nullptr != finalizeProfileLibrary) {
|
||||
(*finalizeProfileLibrary)();
|
||||
|
||||
// Set all profile hooks to NULL to prevent
|
||||
// Set all profile hooks to nullptr to prevent
|
||||
// any additional calls. Once we are told to
|
||||
// finalize, we mean it
|
||||
initProfileLibrary = NULL;
|
||||
finalizeProfileLibrary = NULL;
|
||||
initProfileLibrary = nullptr;
|
||||
finalizeProfileLibrary = nullptr;
|
||||
|
||||
beginForCallee = NULL;
|
||||
beginScanCallee = NULL;
|
||||
beginReduceCallee = NULL;
|
||||
endScanCallee = NULL;
|
||||
endForCallee = NULL;
|
||||
endReduceCallee = NULL;
|
||||
beginForCallee = nullptr;
|
||||
beginScanCallee = nullptr;
|
||||
beginReduceCallee = nullptr;
|
||||
endScanCallee = nullptr;
|
||||
endForCallee = nullptr;
|
||||
endReduceCallee = nullptr;
|
||||
|
||||
pushRegionCallee = NULL;
|
||||
popRegionCallee = NULL;
|
||||
pushRegionCallee = nullptr;
|
||||
popRegionCallee = nullptr;
|
||||
|
||||
allocateDataCallee = NULL;
|
||||
deallocateDataCallee = NULL;
|
||||
allocateDataCallee = nullptr;
|
||||
deallocateDataCallee = nullptr;
|
||||
|
||||
beginDeepCopyCallee = nullptr;
|
||||
endDeepCopyCallee = nullptr;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -81,23 +81,11 @@ typedef void (*popFunction)();
|
||||
typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
|
||||
typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
|
||||
|
||||
|
||||
static initFunction initProfileLibrary = NULL;
|
||||
static finalizeFunction finalizeProfileLibrary = NULL;
|
||||
|
||||
static beginFunction beginForCallee = NULL;
|
||||
static beginFunction beginScanCallee = NULL;
|
||||
static beginFunction beginReduceCallee = NULL;
|
||||
static endFunction endForCallee = NULL;
|
||||
static endFunction endScanCallee = NULL;
|
||||
static endFunction endReduceCallee = NULL;
|
||||
|
||||
static pushFunction pushRegionCallee = NULL;
|
||||
static popFunction popRegionCallee = NULL;
|
||||
|
||||
static allocateDataFunction allocateDataCallee = NULL;
|
||||
static deallocateDataFunction deallocateDataCallee = NULL;
|
||||
|
||||
typedef void (*beginDeepCopyFunction)(
|
||||
SpaceHandle, const char*, const void*,
|
||||
SpaceHandle, const char*, const void*,
|
||||
uint64_t);
|
||||
typedef void (*endDeepCopyFunction)();
|
||||
|
||||
bool profileLibraryLoaded();
|
||||
|
||||
@ -114,35 +102,14 @@ void popRegion();
|
||||
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
|
||||
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
|
||||
|
||||
void beginDeepCopy(const SpaceHandle dst_space, const std::string dst_label, const void* dst_ptr,
|
||||
const SpaceHandle src_space, const std::string src_label, const void* src_ptr,
|
||||
const uint64_t size);
|
||||
void endDeepCopy();
|
||||
|
||||
void initialize();
|
||||
void finalize();
|
||||
|
||||
//Define finalize_fake inline to get rid of warnings for unused static variables
|
||||
inline void finalize_fake() {
|
||||
if(NULL != finalizeProfileLibrary) {
|
||||
(*finalizeProfileLibrary)();
|
||||
|
||||
// Set all profile hooks to NULL to prevent
|
||||
// any additional calls. Once we are told to
|
||||
// finalize, we mean it
|
||||
beginForCallee = NULL;
|
||||
beginScanCallee = NULL;
|
||||
beginReduceCallee = NULL;
|
||||
endScanCallee = NULL;
|
||||
endForCallee = NULL;
|
||||
endReduceCallee = NULL;
|
||||
|
||||
allocateDataCallee = NULL;
|
||||
deallocateDataCallee = NULL;
|
||||
|
||||
initProfileLibrary = NULL;
|
||||
finalizeProfileLibrary = NULL;
|
||||
pushRegionCallee = NULL;
|
||||
popRegionCallee = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
208 lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp Normal file
@ -0,0 +1,208 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Rendezvous.hpp>
#include <impl/Kokkos_Spinwait.hpp>

namespace Kokkos { namespace Impl {

//----------------------------------------------------------------------------
/* pattern for rendezvous
*
* if ( rendezvous() ) {
* ... all other threads are still in team_rendezvous() ...
* rendezvous_release();
* ... all other threads are released from team_rendezvous() ...
* }
*/

int rendezvous( volatile int64_t * const buffer
, int const size
, int const rank
, int const slow
) noexcept
{
enum : int { shift_byte = 3 };
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
enum : int { mask_byte = size_byte - 1 };

enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };

// Cycle step values: 1 <= step <= size_val_cycle
// An odd multiple of memory cycle so that when a memory location
// is reused it has a different value.
// Must be representable within a single byte: size_val_cycle < 16

enum : int { size_val_cycle = 3 * size_mem_cycle };

// Requires:
// Called by rank = [ 0 .. size )
// buffer aligned to int64_t[4]

// A sequence of rendezvous uses four cycled locations in memory
// and non-equal cycled synchronization values to
// 1) prevent rendezvous from overtaking one another and
// 2) give each spin wait location an int64_t[4] span
// so that it has its own cache line.

const int64_t step = (buffer[0] % size_val_cycle ) + 1 ;

// The leading int64_t[4] span is for thread 0 to write
// and all other threads to read spin-wait.
// sync_offset is the index into this array for this step.

const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ;

if ( rank ) {

const int group_begin = rank << shift_byte ; // == rank * size_byte

if ( group_begin < size ) {

// This thread waits for threads
// [ group_begin .. group_begin + 8 )
// [ rank*8 .. rank*8 + 8 )
// to write to their designated bytes.

const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;

int64_t value = 0;
for ( int i = 0 ; i < end ; ++i ) {
value |= step << (i * size_byte );
}

store_fence(); // This should not be needed but fixes #742

if ( slow ) {
yield_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}
else {
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value );
}
}

{
// This thread sets its designated byte.
// ( rank % size_byte ) +
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
// ( sync_offset * size_byte )
const int offset = ( rank & mask_byte )
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
+ ( sync_offset << shift_byte );

// All of this thread's previous memory stores must be complete before
// this thread stores the step value at this thread's designated byte
// in the shared synchronization array.

Kokkos::memory_fence();

((volatile int8_t*) buffer)[ offset ] = int8_t( step );

// Memory fence to push the previous store out
Kokkos::memory_fence();
}

// Wait for thread 0 to release all other threads

if ( slow ) {
yield_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) );
}
else {
spinwait_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) );
}
}
else {
// Thread 0 waits for threads [1..7]
// to write to their designated bytes.

const int end = size_byte < size ? 8 : size ;

int64_t value = 0;
for ( int i = 1 ; i < end ; ++i ) {
value |= step << (i * size_byte );
}

if ( slow ) {
yield_until_equal( buffer[ sync_offset ], value );
}
else {
spinwait_until_equal( buffer[ sync_offset ], value );
}
}

return rank ? 0 : 1 ;
}

void rendezvous_release( volatile int64_t * const buffer ) noexcept
{
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
enum : int { size_val_cycle = 3 * size_mem_cycle };

// Requires:
// Called after team_rendezvous
// Called only by true == team_rendezvous(root)

// update step
const int64_t step = (buffer[0] % size_val_cycle ) + 1;
buffer[0] = step;

// Memory fence to be sure all previous writes are complete:
Kokkos::memory_fence();

buffer[ (step & mask_mem_cycle) + size_mem_cycle ] = step;

// Memory fence to push the store out
Kokkos::memory_fence();
}

}} // namespace Kokkos::Impl
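
The designated-byte arithmetic above spreads each group of eight ranks across its own int64_t[4] span so a group leader spin-waits on a single cache line. The standalone sketch below evaluates the same offset expression for a few ranks to show the mapping; it is illustrative only and the chosen step value is arbitrary.

// --- illustrative sketch, not part of the commit ---
#include <cstdio>

int main()
{
  enum : int { shift_byte = 3 , size_byte = 1 << shift_byte , mask_byte = size_byte - 1 };
  enum : int { shift_mem_cycle = 2 , size_mem_cycle = 1 << shift_mem_cycle , mask_mem_cycle = size_mem_cycle - 1 };

  const int step        = 5 ;   // hypothetical cycle value
  const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ;   // == 9 here

  const int ranks[] = { 1 , 8 , 14 , 63 };
  for ( int rank : ranks ) {
    // Same expression as in rendezvous(): byte index within the shared buffer.
    const int offset = ( rank & mask_byte )
                     + ( ( rank & ~mask_byte ) << shift_mem_cycle )
                     + ( sync_offset << shift_byte );

    std::printf( "rank %2d -> byte %3d (int64_t entry %3d, byte %d within it)\n",
                 rank , offset , offset / size_byte , offset & mask_byte );
  }
  // e.g. rank 14 lands in int64_t entry 13, which is exactly the word that the
  // leader of group [8..15] (rank 1) spin-waits on: (1 << shift_mem_cycle) + sync_offset == 13.
  return 0 ;
}
// --- end sketch ---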
87 lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp Normal file
@ -0,0 +1,87 @@
#ifndef KOKKOS_IMPL_RENDEZVOUS_HPP
#define KOKKOS_IMPL_RENDEZVOUS_HPP

#include <cstdint>

namespace Kokkos { namespace Impl {

inline
constexpr int rendezvous_buffer_size( int max_members ) noexcept
{
return (((max_members + 7) / 8) * 4) + 4 + 4;
}

/** \brief Thread pool rendezvous
*
* Rendezvous pattern:
* if ( rendezvous(root) ) {
* ... only root thread here while all others wait ...
* rendezvous_release();
* }
* else {
* ... all other threads release here ...
* }
*
* Requires: buffer[ rendezvous_buffer_size( max_threads ) ];
*
* When slow != 0 the expectation is thread arrival will be
* slow so the threads that arrive early should quickly yield
* their core to the runtime thus possibly allowing the late
* arriving threads to have more resources
* (e.g., power and clock frequency).
*/
int rendezvous( volatile int64_t * const buffer
, int const size
, int const rank
, int const slow = 0 ) noexcept ;

void rendezvous_release( volatile int64_t * const buffer ) noexcept ;

}} // namespace Kokkos::Impl

#endif // KOKKOS_IMPL_RENDEZVOUS_HPP
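
A hedged usage sketch of the documented pattern follows, driving the internal Kokkos::Impl::rendezvous and rendezvous_release from plain std::thread workers. It assumes this header is on the include path and that the program links against the Kokkos core library; it illustrates the calling convention only and is not code from the commit.

// --- illustrative sketch, not part of the commit ---
#include <impl/Kokkos_Rendezvous.hpp>
#include <thread>
#include <vector>
#include <cstdint>
#include <cstdio>

int main()
{
  constexpr int nthreads = 8 ;

  // Requires: buffer[ rendezvous_buffer_size( max_threads ) ];
  // zero-initialized here so the first step value is well defined (an assumption of this sketch).
  std::vector<int64_t> buffer( Kokkos::Impl::rendezvous_buffer_size( nthreads ) , 0 );

  std::vector<std::thread> pool ;
  for ( int rank = 0 ; rank < nthreads ; ++rank ) {
    pool.emplace_back( [&, rank] {
      if ( Kokkos::Impl::rendezvous( buffer.data(), nthreads, rank ) ) {
        // Only the rank-0 thread gets here; every other thread is still waiting.
        std::printf( "all %d threads arrived\n", nthreads );
        Kokkos::Impl::rendezvous_release( buffer.data() );
      }
    } );
  }
  for ( auto & t : pool ) t.join();
  return 0 ;
}
// --- end sketch ---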
@ -50,6 +50,7 @@
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>

#include <impl/Kokkos_SharedAlloc.hpp>

/*--------------------------------------------------------------------------*/

@ -123,7 +124,6 @@ void serial_resize_thread_team_data( size_t pool_reduce_bytes
}
}

// Get thread team data structure for omp_get_thread_num()
HostThreadTeamData * serial_get_thread_team_data()
{
return & g_serial_thread_team_data ;
@ -151,6 +151,8 @@ void Serial::initialize( unsigned threads_count
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;

Impl::SharedAllocationRecord< void, void >::tracking_enable();

// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if defined(KOKKOS_ENABLE_PROFILING)

@ -62,7 +62,7 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
{
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;

task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
@ -122,7 +122,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
{
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using task_root_type = TaskBase< void , void , void > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;

task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

@ -65,7 +65,7 @@ public:
using execution_space = Kokkos::Serial ;
using memory_space = Kokkos::HostSpace ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using task_base_type = Kokkos::Impl::TaskBase< void , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;

static
102 lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp Normal file
@ -0,0 +1,102 @@
#ifndef KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP
#define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP

namespace Kokkos {
namespace Impl {

template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Serial
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Serial,
Traits ...
>
{
private:

typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;

template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}

template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}

public:

inline
void execute()
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
}
}

inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
};

} // namespace Impl
} // namespace Kokkos

#endif /* #define KOKKOS_SERIAL_WORKGRAPHPOLICY_HPP */
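
The two exec_one overloads above use enable_if on the work tag so a functor is invoked either as f(i) or as f(Tag(), i). The stripped-down sketch below shows the same dispatch outside of Kokkos; all names in it (MyTag, PlainFunctor, TaggedFunctor) are invented for the illustration.

// --- illustrative sketch, not part of the commit ---
#include <cstdio>
#include <type_traits>

struct MyTag {};   // hypothetical work tag

// Untagged functor: operator()(i)
struct PlainFunctor {
  void operator()( int i ) const { std::printf( "plain %d\n", i ); }
};

// Tagged functor: operator()(MyTag, i)
struct TaggedFunctor {
  void operator()( MyTag , int i ) const { std::printf( "tagged %d\n", i ); }
};

// Selected when the tag is void: call the functor with the index only.
template< class TagType , class Functor >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one( const Functor & f , int i ) { f( i ); }

// Selected when a tag is present: pass a default-constructed tag first.
template< class TagType , class Functor >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one( const Functor & f , int i ) { const TagType t{}; f( t , i ); }

int main()
{
  exec_one< void  >( PlainFunctor{}  , 1 );   // picks the untagged overload
  exec_one< MyTag >( TaggedFunctor{} , 2 );   // picks the tagged overload
  return 0;
}
// --- end sketch ---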
@ -46,23 +46,23 @@
namespace Kokkos {
namespace Impl {

int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
namespace {

void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
{
// A host thread claim and disable tracking flag
__thread int t_tracking_enabled = 1;

while ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) );
}

void SharedAllocationRecord< void , void >::tracking_release_and_enable()
{
// The host thread that claimed and disabled the tracking flag
// now release and enable tracking.
int SharedAllocationRecord< void , void >::tracking_enabled()
{ return t_tracking_enabled; }

if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
}
void SharedAllocationRecord< void , void >::tracking_disable()
{
t_tracking_enabled = 0;
}

void SharedAllocationRecord< void , void >::tracking_enable()
{
t_tracking_enabled = 1;
}

//----------------------------------------------------------------------------

@ -71,6 +71,9 @@ public:
KOKKOS_INLINE_FUNCTION static
const SharedAllocationHeader * get_header( void * alloc_ptr )
{ return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); }

KOKKOS_INLINE_FUNCTION
const char* label() const { return m_label; }
};

template<>
@ -83,8 +86,6 @@ protected:

typedef void (* function_type )( SharedAllocationRecord<void,void> * );

static int s_tracking_enabled ;

SharedAllocationHeader * const m_alloc_ptr ;
size_t const m_alloc_size ;
function_type const m_dealloc ;
@ -110,17 +111,17 @@ protected:
public:
inline std::string get_label() const { return std::string("Unmanaged"); }

static int tracking_enabled() { return s_tracking_enabled ; }
static int tracking_enabled();

/**\brief A host process thread claims and disables the
* shared allocation tracking flag.
*/
static void tracking_claim_and_disable();
static void tracking_disable();

/**\brief A host process thread releases and enables the
* shared allocation tracking flag.
*/
static void tracking_release_and_enable();
static void tracking_enable();

~SharedAllocationRecord() = default ;

@ -317,6 +318,11 @@ public:

#endif
}

KOKKOS_INLINE_FUNCTION
bool has_record() const {
return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}

KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }
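
The change above replaces the global claim-and-release lock on s_tracking_enabled with a per-thread flag behind tracking_disable() and tracking_enable(). One way calling code might bracket an untracked region is an RAII guard such as the sketch below; the guard class is invented for illustration, and only the two static member calls come from the header changed above.

// --- illustrative sketch, not part of the commit ---
#include <impl/Kokkos_SharedAlloc.hpp>

// Hypothetical helper: disable allocation tracking for the current thread
// for the lifetime of the guard object, then restore it.
class ScopedTrackingDisable {
public:
  ScopedTrackingDisable()  { Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_disable(); }
  ~ScopedTrackingDisable() { Kokkos::Impl::SharedAllocationRecord<void,void>::tracking_enable();  }
  ScopedTrackingDisable( const ScopedTrackingDisable & ) = delete ;
  ScopedTrackingDisable & operator = ( const ScopedTrackingDisable & ) = delete ;
};

void untracked_work()
{
  ScopedTrackingDisable guard ;   // affects this thread only; other threads are untouched
  // ... construct unmanaged views / raw allocations here ...
}
// --- end sketch ---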
210 lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp Normal file
@ -0,0 +1,210 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <impl/Kokkos_Spinwait.hpp>
|
||||
#include <impl/Kokkos_BitOps.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_STDTHREAD )
|
||||
#include <thread>
|
||||
#elif !defined( _WIN32 )
|
||||
#include <sched.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
#include <process.h>
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
|
||||
void host_thread_yield( const uint32_t i , const int force_yield )
|
||||
{
|
||||
static constexpr uint32_t sleep_limit = 1 << 13 ;
|
||||
static constexpr uint32_t yield_limit = 1 << 12 ;
|
||||
|
||||
const int c = Kokkos::Impl::bit_scan_reverse(i);
|
||||
|
||||
if ( sleep_limit < i ) {
|
||||
|
||||
// Attempt to put the thread to sleep for 'c' milliseconds
|
||||
|
||||
#if defined( KOKKOS_ENABLE_STDTHREAD )
|
||||
std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) )
|
||||
#elif !defined( _WIN32 )
|
||||
timespec req ;
|
||||
req.tv_sec = 0 ;
|
||||
req.tv_nsec = 1000 * c ;
|
||||
nanosleep( &req, nullptr );
|
||||
#else /* defined( _WIN32 ) IS Microsoft Windows */
|
||||
Sleep(c);
|
||||
#endif
|
||||
}
|
||||
|
||||
else if ( force_yield || yield_limit < i ) {
|
||||
|
||||
// Attempt to yield thread resources to runtime
|
||||
|
||||
#if defined( KOKKOS_ENABLE_STDTHREAD )
|
||||
std::this_thread::yield();
|
||||
#elif !defined( _WIN32 )
|
||||
sched_yield();
|
||||
#else /* defined( _WIN32 ) IS Microsoft Windows */
|
||||
YieldProcessor();
|
||||
#endif
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ENABLE_ASM )
|
||||
|
||||
else if ( (1u<<4) < i ) {
|
||||
|
||||
// Insert a few no-ops to quiet the thread:
|
||||
|
||||
for ( int k = 0 ; k < c ; ++k ) {
|
||||
#if defined( __amd64 ) || defined( __amd64__ ) || \
|
||||
defined( __x86_64 ) || defined( __x86_64__ )
|
||||
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
|
||||
asm volatile( "nop\n" );
|
||||
#else
|
||||
__asm__ __volatile__( "nop\n" );
|
||||
#endif
|
||||
#elif defined(__PPC64__)
|
||||
asm volatile( "nop\n" );
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
// Insert memory pause
|
||||
#if defined( __amd64 ) || defined( __amd64__ ) || \
|
||||
defined( __x86_64 ) || defined( __x86_64__ )
|
||||
#if !defined( _WIN32 ) /* IS NOT Microsoft Windows */
|
||||
asm volatile( "pause\n":::"memory" );
|
||||
#else
|
||||
__asm__ __volatile__( "pause\n":::"memory" );
|
||||
#endif
|
||||
#elif defined(__PPC64__)
|
||||
asm volatile( "or 27, 27, 27" ::: "memory" );
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif /* defined( KOKKOS_ENABLE_ASM ) */
|
||||
}
|
||||
|
||||
}}} // namespace Kokkos::Impl::{anonymous}
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void yield_while_equal( volatile int32_t & flag , const int32_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void yield_until_equal( volatile int32_t & flag , const int32_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void yield_while_equal( volatile int64_t & flag , const int64_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void yield_until_equal( volatile int64_t & flag , const int64_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}
|
||||
#endif
|
||||
|
||||
@ -59,6 +59,13 @@ void spinwait_until_equal( volatile int32_t & flag , const int32_t value );

void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );

void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );

void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );

#else

KOKKOS_INLINE_FUNCTION
@ -71,6 +78,16 @@ void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}

KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}

KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}

#endif

} /* namespace Impl */
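
The new yield_* variants share the termination condition of the spinwait_* functions and differ only in ceding the core on every miss, which is the slow path selected by the pool rendezvous on IBM POWER earlier in this commit. The self-contained sketch below contrasts the two waiting disciplines on a std::atomic flag; it is illustrative and not the library's implementation.

// --- illustrative sketch, not part of the commit ---
#include <atomic>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <thread>

// Busy-spin until the flag reaches 'value' (burns the core, lowest latency).
void spin_until_equal_sketch( const std::atomic<int64_t> & flag , int64_t value )
{
  while ( flag.load( std::memory_order_acquire ) != value ) { /* spin */ }
}

// Yield to the OS scheduler on every miss (frees the core for late arrivals).
void yield_until_equal_sketch( const std::atomic<int64_t> & flag , int64_t value )
{
  while ( flag.load( std::memory_order_acquire ) != value ) std::this_thread::yield();
}

int main()
{
  std::atomic<int64_t> flag{ 0 };

  std::thread waiter( [&]{ yield_until_equal_sketch( flag , 42 ); std::puts( "released" ); } );

  std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
  flag.store( 42 , std::memory_order_release );

  waiter.join();
  return 0 ;
}
// --- end sketch ---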
@ -59,24 +59,15 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/*\brief Implementation data for task data management, access, and execution.
|
||||
*
|
||||
* Curiously recurring template pattern (CRTP)
|
||||
* to allow static_cast from the
|
||||
* task root type and a task's FunctorType.
|
||||
*
|
||||
* TaskBase< Space , ResultType , FunctorType >
|
||||
* : TaskBase< Space , ResultType , void >
|
||||
* , FunctorType
|
||||
* { ... };
|
||||
*
|
||||
* TaskBase< Space , ResultType , void >
|
||||
* : TaskBase< Space , void , void >
|
||||
* { ... };
|
||||
*/
|
||||
template< typename Space , typename ResultType , typename FunctorType >
|
||||
template< class Space , typename ResultType , class FunctorType >
|
||||
class TaskBase ;
|
||||
|
||||
template< typename Space >
|
||||
class TaskQueue ;
|
||||
|
||||
template< typename Space >
|
||||
class TaskQueueSpecialization ;
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
@ -86,8 +77,217 @@ class TaskBase ;
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< typename Space >
|
||||
class TaskQueueSpecialization ;
|
||||
/** \brief Base class for task management, access, and execution.
|
||||
*
|
||||
* Inheritance structure to allow static_cast from the task root type
|
||||
* and a task's FunctorType.
|
||||
*
|
||||
* // Enable a functor to access the base class
|
||||
* // and provide memory for result value.
|
||||
* TaskBase< Space , ResultType , FunctorType >
|
||||
* : TaskBase< void , void , void >
|
||||
* , FunctorType
|
||||
* { ... };
|
||||
* Followed by memory allocated for result value.
|
||||
*
|
||||
*
|
||||
* States of a task:
|
||||
*
|
||||
* Constructing State, NOT IN a linked list
|
||||
* m_wait == 0
|
||||
* m_next == 0
|
||||
*
|
||||
* Scheduling transition : Constructing -> Waiting
|
||||
* before:
|
||||
* m_wait == 0
|
||||
* m_next == this task's initial dependence, 0 if none
|
||||
* after:
|
||||
* m_wait == EndTag
|
||||
* m_next == EndTag
|
||||
*
|
||||
* Waiting State, IN a linked list
|
||||
* m_apply != 0
|
||||
* m_queue != 0
|
||||
* m_ref_count > 0
|
||||
* m_wait == head of linked list of tasks waiting on this task
|
||||
* m_next == next of linked list of tasks
|
||||
*
|
||||
* transition : Waiting -> Executing
|
||||
* before:
|
||||
* m_next == EndTag
|
||||
* after::
|
||||
* m_next == LockTag
|
||||
*
|
||||
* Executing State, NOT IN a linked list
|
||||
* m_apply != 0
|
||||
* m_queue != 0
|
||||
* m_ref_count > 0
|
||||
* m_wait == head of linked list of tasks waiting on this task
|
||||
* m_next == LockTag
|
||||
*
|
||||
* Respawn transition : Executing -> Executing-Respawn
|
||||
* before:
|
||||
* m_next == LockTag
|
||||
* after:
|
||||
* m_next == this task's updated dependence, 0 if none
|
||||
*
|
||||
* Executing-Respawn State, NOT IN a linked list
|
||||
* m_apply != 0
|
||||
* m_queue != 0
|
||||
* m_ref_count > 0
|
||||
* m_wait == head of linked list of tasks waiting on this task
|
||||
* m_next == this task's updated dependence, 0 if none
|
||||
*
|
||||
* transition : Executing -> Complete
|
||||
* before:
|
||||
* m_wait == head of linked list
|
||||
* after:
|
||||
* m_wait == LockTag
|
||||
*
|
||||
* Complete State, NOT IN a linked list
|
||||
* m_wait == LockTag: cannot add dependence (<=> complete)
|
||||
* m_next == LockTag: not a member of a wait queue
|
||||
*
|
||||
*/
|
||||
template<>
|
||||
class TaskBase< void , void , void >
|
||||
{
|
||||
public:
|
||||
|
||||
enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
|
||||
enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
|
||||
|
||||
template< typename > friend class Kokkos::TaskScheduler ;
|
||||
|
||||
typedef TaskQueue< void > queue_type ;
|
||||
|
||||
typedef void (* function_type) ( TaskBase * , void * );
|
||||
|
||||
// sizeof(TaskBase) == 48
|
||||
|
||||
function_type m_apply ; ///< Apply function pointer
|
||||
queue_type * m_queue ; ///< Pointer to queue
|
||||
TaskBase * m_wait ; ///< Linked list of tasks waiting on this
|
||||
TaskBase * m_next ; ///< Waiting linked-list next
|
||||
int32_t m_ref_count ; ///< Reference count
|
||||
int32_t m_alloc_size ; ///< Allocation size
|
||||
int32_t m_dep_count ; ///< Aggregate's number of dependences
|
||||
int16_t m_task_type ; ///< Type of task
|
||||
int16_t m_priority ; ///< Priority of runnable task
|
||||
|
||||
TaskBase( TaskBase && ) = delete ;
|
||||
TaskBase( const TaskBase & ) = delete ;
|
||||
TaskBase & operator = ( TaskBase && ) = delete ;
|
||||
TaskBase & operator = ( const TaskBase & ) = delete ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
TaskBase()
|
||||
: m_apply( 0 )
|
||||
, m_queue( 0 )
|
||||
, m_wait( 0 )
|
||||
, m_next( 0 )
|
||||
, m_ref_count( 0 )
|
||||
, m_alloc_size( 0 )
|
||||
, m_dep_count( 0 )
|
||||
, m_task_type( 0 )
|
||||
, m_priority( 0 )
|
||||
{}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskBase * volatile * aggregate_dependences() volatile
|
||||
{ return reinterpret_cast<TaskBase*volatile*>( this + 1 ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool requested_respawn()
|
||||
{
|
||||
// This should only be called when a task has finished executing and is
|
||||
// in the transition to either the complete or executing-respawn state.
|
||||
TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag );
|
||||
return lock != m_next;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void add_dependence( TaskBase* dep )
|
||||
{
|
||||
// Precondition: lock == m_next
|
||||
|
||||
TaskBase * const lock = (TaskBase *) LockTag ;
|
||||
|
||||
// Assign dependence to m_next. It will be processed in the subsequent
|
||||
// call to schedule. Error if the dependence is reset.
|
||||
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
|
||||
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
|
||||
}
|
||||
|
||||
if ( 0 != dep ) {
|
||||
// The future may be destroyed upon returning from this call
|
||||
// so increment reference count to track this assignment.
|
||||
Kokkos::atomic_increment( &(dep->m_ref_count) );
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int32_t reference_count() const
|
||||
{ return *((int32_t volatile *)( & m_ref_count )); }
|
||||
|
||||
};
|
||||
|
||||
static_assert( sizeof(TaskBase<void,void,void>) == 48
|
||||
, "Verifying expected sizeof(TaskBase<void,void,void>)" );
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ResultType >
|
||||
struct TaskResult {
|
||||
|
||||
enum : int32_t { size = sizeof(ResultType) };
|
||||
|
||||
using reference_type = ResultType & ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
ResultType * ptr( TaskBase<void,void,void> * task )
|
||||
{
|
||||
return reinterpret_cast< ResultType * >
|
||||
( reinterpret_cast< char * >(task) + task->m_alloc_size - sizeof(ResultType) );
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
reference_type get( TaskBase<void,void,void> * task )
|
||||
{ return *ptr( task ); }
|
||||
};
|
||||
|
||||
template<>
|
||||
struct TaskResult< void > {
|
||||
|
||||
enum : int32_t { size = 0 };
|
||||
|
||||
using reference_type = void ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void * ptr( TaskBase<void,void,void> * ) { return (void*) 0 ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
reference_type get( TaskBase<void,void,void> * ) {}
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class TaskQueue< void > {};
|
||||
|
||||
/** \brief Manage task allocation, deallocation, and scheduling.
|
||||
*
|
||||
@ -95,7 +295,7 @@ class TaskQueueSpecialization ;
|
||||
* All other aspects of task management have shared implementation.
|
||||
*/
|
||||
template< typename ExecSpace >
|
||||
class TaskQueue {
|
||||
class TaskQueue : public TaskQueue<void> {
|
||||
private:
|
||||
|
||||
friend class TaskQueueSpecialization< ExecSpace > ;
|
||||
@ -106,7 +306,7 @@ private:
|
||||
using memory_space = typename specialization::memory_space ;
|
||||
using device_type = Kokkos::Device< execution_space , memory_space > ;
|
||||
using memory_pool = Kokkos::MemoryPool< device_type > ;
|
||||
using task_root_type = Kokkos::Impl::TaskBase<execution_space,void,void> ;
|
||||
using task_root_type = Kokkos::Impl::TaskBase<void,void,void> ;
|
||||
|
||||
struct Destroy {
|
||||
TaskQueue * m_queue ;
|
||||
@ -198,12 +398,10 @@ public:
|
||||
}
|
||||
|
||||
// Assign task pointer with reference counting of assigned tasks
|
||||
template< typename LV , typename RV >
|
||||
KOKKOS_FUNCTION static
|
||||
void assign( TaskBase< execution_space,LV,void> ** const lhs
|
||||
, TaskBase< execution_space,RV,void> * const rhs )
|
||||
void assign( task_root_type ** const lhs
|
||||
, task_root_type * const rhs )
|
||||
{
|
||||
using task_lhs = TaskBase< execution_space,LV,void> ;
|
||||
#if 0
|
||||
{
|
||||
printf( "assign( 0x%lx { 0x%lx %d %d } , 0x%lx { 0x%lx %d %d } )\n"
|
||||
@ -225,7 +423,7 @@ public:
|
||||
|
||||
// Force write of *lhs
|
||||
|
||||
*static_cast< task_lhs * volatile * >(lhs) = rhs ;
|
||||
*static_cast< task_root_type * volatile * >(lhs) = rhs ;
|
||||
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
@ -238,6 +436,38 @@ public:

KOKKOS_FUNCTION
void deallocate( void * p , size_t n ); ///< Deallocate to the memory pool

//----------------------------------------
/**\brief Allocation size for a spawned task */

template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using value_type = typename FunctorType::value_type ;

using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;

enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };
enum : size_t { task_size = sizeof(task_type) };
enum : size_t { result_size = Impl::TaskResult< value_type >::size };
enum : size_t { alloc_size =
( ( task_size + align_mask ) & ~align_mask ) +
( ( result_size + align_mask ) & ~align_mask ) };

return m_memory.allocate_block_size( task_size );
}

/**\brief Allocation size for a when_all aggregate */

KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
return m_memory.allocate_block_size( sizeof(task_root_type) + narg * sizeof(task_root_type*) );
}
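
The alloc_size computation above rounds the task object and its result storage up to 16-byte boundaries with the usual mask trick, (n + align_mask) & ~align_mask. The short standalone check below walks through that arithmetic with invented sizes.

// --- illustrative sketch, not part of the commit ---
#include <cstddef>
#include <cstdio>

int main()
{
  enum : size_t { align = ( 1 << 4 ) , align_mask = align - 1 };   // 16-byte alignment

  // Hypothetical sizes: a 48-byte task header plus functor state, and an 8-byte result.
  const size_t task_size   = 72 ;
  const size_t result_size = 8 ;

  const size_t alloc_size = ( ( task_size   + align_mask ) & ~align_mask )
                          + ( ( result_size + align_mask ) & ~align_mask );

  std::printf( "task %zu -> %zu, result %zu -> %zu, total %zu\n",
               task_size , ( task_size + align_mask ) & ~align_mask ,
               result_size , ( result_size + align_mask ) & ~align_mask ,
               alloc_size );   // prints: task 72 -> 80, result 8 -> 16, total 96
  return 0 ;
}
// --- end sketch ---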
};
|
||||
|
||||
} /* namespace Impl */
|
||||
@ -249,261 +479,9 @@ public:
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class TaskBase< void , void , void > {
|
||||
public:
|
||||
enum : int16_t { TaskTeam = 0 , TaskSingle = 1 , Aggregate = 2 };
enum : uintptr_t { LockTag = ~uintptr_t(0) , EndTag = ~uintptr_t(1) };
};

/** \brief Base class for task management, access, and execution.
 *
 *  Inheritance structure to allow static_cast from the task root type
 *  and a task's FunctorType.
 *
 *    // Enable a Future to access result data
 *    TaskBase< Space , ResultType , void >
 *      : TaskBase< void , void , void >
 *      { ... };
 *
 *    // Enable a functor to access the base class
 *    TaskBase< Space , ResultType , FunctorType >
 *      : TaskBase< Space , ResultType , void >
 *      , FunctorType
 *      { ... };
 *
 *
 *  States of a task:
 *
 *    Constructing State, NOT IN a linked list
 *      m_wait == 0
 *      m_next == 0
 *
 *    Scheduling transition : Constructing -> Waiting
 *      before:
 *        m_wait == 0
 *        m_next == this task's initial dependence, 0 if none
 *      after:
 *        m_wait == EndTag
 *        m_next == EndTag
 *
 *    Waiting State, IN a linked list
 *      m_apply != 0
 *      m_queue != 0
 *      m_ref_count > 0
 *      m_wait == head of linked list of tasks waiting on this task
 *      m_next == next of linked list of tasks
 *
 *    transition : Waiting -> Executing
 *      before:
 *        m_next == EndTag
 *      after:
 *        m_next == LockTag
 *
 *    Executing State, NOT IN a linked list
 *      m_apply != 0
 *      m_queue != 0
 *      m_ref_count > 0
 *      m_wait == head of linked list of tasks waiting on this task
 *      m_next == LockTag
 *
 *    Respawn transition : Executing -> Executing-Respawn
 *      before:
 *        m_next == LockTag
 *      after:
 *        m_next == this task's updated dependence, 0 if none
 *
 *    Executing-Respawn State, NOT IN a linked list
 *      m_apply != 0
 *      m_queue != 0
 *      m_ref_count > 0
 *      m_wait == head of linked list of tasks waiting on this task
 *      m_next == this task's updated dependence, 0 if none
 *
 *    transition : Executing -> Complete
 *      before:
 *        m_wait == head of linked list
 *      after:
 *        m_wait == LockTag
 *
 *    Complete State, NOT IN a linked list
 *      m_wait == LockTag: cannot add dependence
 *      m_next == LockTag: not a member of a wait queue
 *
 */
template< typename ExecSpace >
class TaskBase< ExecSpace , void , void >
{
public:

enum : int16_t { TaskTeam = TaskBase<void,void,void>::TaskTeam
, TaskSingle = TaskBase<void,void,void>::TaskSingle
, Aggregate = TaskBase<void,void,void>::Aggregate };

enum : uintptr_t { LockTag = TaskBase<void,void,void>::LockTag
, EndTag = TaskBase<void,void,void>::EndTag };

using execution_space = ExecSpace ;
using queue_type = TaskQueue< execution_space > ;

template< typename > friend class Kokkos::TaskScheduler ;

typedef void (* function_type) ( TaskBase * , void * );

// sizeof(TaskBase) == 48

function_type m_apply ; ///< Apply function pointer
queue_type * m_queue ; ///< Queue in which this task resides
TaskBase * m_wait ; ///< Linked list of tasks waiting on this
TaskBase * m_next ; ///< Waiting linked-list next
int32_t m_ref_count ; ///< Reference count
int32_t m_alloc_size ; ///< Allocation size
int32_t m_dep_count ; ///< Aggregate's number of dependences
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task

TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;

KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;

// Constructor for a runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, TaskBase * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
) noexcept
: m_apply( arg_apply )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( arg_dependence )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( 0 )
, m_task_type( arg_task_type )
, m_priority( arg_priority )
{}

// Constructor for an aggregate task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( queue_type * arg_queue
, int arg_ref_count
, int arg_alloc_size
, int arg_dep_count
) noexcept
: m_apply( 0 )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( arg_dep_count )
, m_task_type( Aggregate )
, m_priority( 0 )
{}

//----------------------------------------

KOKKOS_INLINE_FUNCTION
TaskBase ** aggregate_dependences()
{ return reinterpret_cast<TaskBase**>( this + 1 ); }

KOKKOS_INLINE_FUNCTION
bool requested_respawn()
{
// This should only be called when a task has finished executing and is
// in the transition to either the complete or executing-respawn state.
TaskBase * const lock = reinterpret_cast< TaskBase * >( LockTag );
return lock != m_next;
}

KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next

TaskBase * const lock = (TaskBase *) LockTag ;

// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}

if ( 0 != dep ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_increment( &(dep->m_ref_count) );
}
}

using get_return_type = void ;

KOKKOS_INLINE_FUNCTION
get_return_type get() const {}
};
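// Illustrative sketch (assumed usage, not part of the original source): given
// the state encoding documented above, a scheduler can distinguish the respawn
// and complete outcomes purely from the m_next sentinel once a task finishes
// executing; 'after_execution' below is a hypothetical helper name.
//
//   void after_execution( TaskBase< ExecSpace , void , void > * t )
//   {
//     if ( t->requested_respawn() ) {
//       // m_next != LockTag : it holds the updated dependence, reschedule the task
//     }
//     else {
//       // m_next == LockTag : complete the task; m_wait becomes LockTag and
//       // the tasks that were waiting on it are scheduled
//     }
//   }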

template < typename ExecSpace , typename ResultType >
class TaskBase< ExecSpace , ResultType , void >
: public TaskBase< ExecSpace , void , void >
{
private:

using root_type = TaskBase<ExecSpace,void,void> ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;

static_assert( sizeof(root_type) == 48 , "" );

TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;

public:

ResultType m_result ;

KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;

// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
)
: root_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, m_result()
{}

using get_return_type = ResultType const & ;

KOKKOS_INLINE_FUNCTION
get_return_type get() const { return m_result ; }
};
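// Illustrative sketch (assumed usage, not from the original source): the
// inheritance documented above lets a Future recover its result by casting the
// root-type task pointer it holds; 'Space' and 'fut_task' are hypothetical.
//
//   TaskBase< Space , void , void > * fut_task = /* pointer held by a Future */ ;
//   double const & value =
//     static_cast< TaskBase< Space , double , void > * >( fut_task )->get() ;  // returns m_result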

template< typename ExecSpace , typename ResultType , typename FunctorType >
template< class ExecSpace , typename ResultType , class FunctorType >
class TaskBase
: public TaskBase< ExecSpace , ResultType , void >
: public TaskBase< void , void , void >
, public FunctorType
{
private:
@ -516,50 +494,31 @@ private:

public:

using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using specialization = TaskQueueSpecialization< ExecSpace > ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
using member_type = typename specialization::member_type ;
using root_type = TaskBase< void , void , void > ;
using functor_type = FunctorType ;
using result_type = ResultType ;

template< typename Type >
KOKKOS_INLINE_FUNCTION static
void apply_functor
( Type * const task
, typename std::enable_if
< std::is_same< typename Type::result_type , void >::value
, member_type * const
>::type member
)
{
using fType = typename Type::functor_type ;
static_cast<fType*>(task)->operator()( *member );
}
using specialization = TaskQueueSpecialization< ExecSpace > ;
using member_type = typename specialization::member_type ;

template< typename Type >
KOKKOS_INLINE_FUNCTION static
void apply_functor
( Type * const task
, typename std::enable_if
< ! std::is_same< typename Type::result_type , void >::value
, member_type * const
>::type member
)
{
using fType = typename Type::functor_type ;
static_cast<fType*>(task)->operator()( *member , task->m_result );
}
KOKKOS_INLINE_FUNCTION
void apply_functor( member_type * const member , void * )
{ functor_type::operator()( *member ); }

template< typename T >
KOKKOS_INLINE_FUNCTION
void apply_functor( member_type * const member
, T * const result )
{ functor_type::operator()( *member , *result ); }

KOKKOS_FUNCTION static
void apply( root_type * root , void * exec )
{
TaskBase * const task = static_cast< TaskBase * >( root );
member_type * const member = reinterpret_cast< member_type * >( exec );
result_type * const result = TaskResult< result_type >::ptr( task );

TaskBase::template apply_functor( task , member );
task->apply_functor( member , result );

// Task may be serial or team.
// If team then must synchronize before querying if respawn was requested.
@ -576,26 +535,9 @@ public:
}

// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
, FunctorType && arg_functor
)
: base_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, functor_type( arg_functor )
{}
KOKKOS_INLINE_FUNCTION constexpr
TaskBase( FunctorType && arg_functor )
: root_type() , functor_type( std::move(arg_functor) ) {}

KOKKOS_INLINE_FUNCTION
~TaskBase() {}
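// Illustrative note (assumption, not part of the original source): the two
// apply_functor overloads above select at compile time on the result pointer
// obtained from TaskResult< result_type >::ptr( task ), e.g.
//
//   task->apply_functor( member , result );
//     // result_type == void : matches the 'void *' overload -> functor( *member )
//     // result_type == T    : matches the 'T *' template    -> functor( *member , *result )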

@ -44,6 +44,8 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )

#define KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING 0

namespace Kokkos {
namespace Impl {

@ -100,9 +102,11 @@ KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::decrement
( TaskQueue< ExecSpace >::task_root_type * task )
{
const int count = Kokkos::atomic_fetch_add(&(task->m_ref_count),-1);
task_root_type volatile & t = *task ;

#if 0
const int count = Kokkos::atomic_fetch_add(&(t.m_ref_count),-1);

#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
if ( 1 == count ) {
printf( "decrement-destroy( 0x%lx { 0x%lx %d %d } )\n"
, uintptr_t( task )
@ -114,9 +118,13 @@ void TaskQueue< ExecSpace >::decrement
#endif

if ( ( 1 == count ) &&
( task->m_next == (task_root_type *) task_root_type::LockTag ) ) {
( t.m_next == (task_root_type *) task_root_type::LockTag ) ) {
// Reference count is zero and task is complete, deallocate.
task->m_queue->deallocate( task , task->m_alloc_size );

TaskQueue< ExecSpace > * const queue =
static_cast< TaskQueue< ExecSpace > * >( t.m_queue );

queue->deallocate( task , t.m_alloc_size );
}
else if ( count <= 1 ) {
Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
@ -171,7 +179,7 @@ bool TaskQueue< ExecSpace >::push_task
// Fail the push attempt if the queue is locked;
// otherwise retry until the push succeeds.

#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "push_task( 0x%lx { 0x%lx } 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(*queue)
@ -186,9 +194,9 @@ bool TaskQueue< ExecSpace >::push_task
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;

task_root_type * volatile * const next = & task->m_next ;
task_root_type * volatile & next = task->m_next ;

if ( zero != *next ) {
if ( zero != next ) {
Kokkos::abort("TaskQueue::push_task ERROR: already a member of another queue" );
}

@ -196,9 +204,9 @@ bool TaskQueue< ExecSpace >::push_task

while ( lock != y ) {

*next = y ;
next = y ;

// Do not proceed until '*next' has been stored.
// Do not proceed until 'next' has been stored.
Kokkos::memory_fence();

task_root_type * const x = y ;
@ -211,9 +219,9 @@ bool TaskQueue< ExecSpace >::push_task
// Failed, replace 'task->m_next' value since 'task' remains
// not a member of a queue.

*next = zero ;
next = zero ;

// Do not proceed until '*next' has been stored.
// Do not proceed until 'next' has been stored.
Kokkos::memory_fence();

return false ;
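// Illustrative sketch of the retry pattern in this hunk (assumed shape; the
// head-swap step itself lies outside the lines shown):
//
//   task_root_type * y = *queue ;
//   while ( lock != y ) {
//     next = y ;                      // tentatively link the task ahead of the current head
//     Kokkos::memory_fence();         // publish the link before attempting the swap
//     task_root_type * const x = y ;
//     y = Kokkos::atomic_compare_exchange( queue , x , task );
//     if ( x == y ) return true ;     // head swap succeeded, the task is enqueued
//   }
//   next = zero ;                     // queue was locked: undo the tentative link
//   Kokkos::memory_fence();
//   return false ;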
@ -270,11 +278,13 @@ TaskQueue< ExecSpace >::pop_ready_task
// This thread has exclusive access to
// the queue and the popped task's m_next.

*queue = task->m_next ; task->m_next = lock ;
task_root_type * volatile & next = task->m_next ;

*queue = next ; next = lock ;

Kokkos::memory_fence();

#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
@ -323,7 +333,7 @@ void TaskQueue< ExecSpace >::schedule_runnable
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)

#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -337,20 +347,22 @@ void TaskQueue< ExecSpace >::schedule_runnable
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

task_root_type volatile & t = *task ;

bool respawn = false ;

//----------------------------------------

if ( zero == task->m_wait ) {
if ( zero == t.m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread

task->m_wait = end ;
t.m_wait = end ;
// Task in Waiting state
}
else if ( lock != task->m_wait ) {
else if ( lock != t.m_wait ) {
// Task in Executing state with Respawn request
// - Update dependence
// - Transition to Waiting state
@ -373,7 +385,9 @@ void TaskQueue< ExecSpace >::schedule_runnable

// Exclusive access so don't need an atomic exchange
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
task_root_type * dep = task->m_next ; task->m_next = zero ;
task_root_type * dep = t.m_next ; t.m_next = zero ;

Kokkos::memory_fence();

const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
@ -398,7 +412,7 @@ void TaskQueue< ExecSpace >::schedule_runnable
Kokkos::atomic_increment( & m_ready_count );

task_root_type * volatile * const ready_queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
& m_ready[ t.m_priority ][ t.m_task_type ];

// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
@ -441,7 +455,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)

#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -455,18 +469,20 @@ void TaskQueue< ExecSpace >::schedule_aggregate
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

task_root_type volatile & t = *task ;

//----------------------------------------

if ( zero == task->m_wait ) {
if ( zero == t.m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread

task->m_wait = end ;
t.m_wait = end ;
// Task in Waiting state
}
else if ( lock == task->m_wait ) {
else if ( lock == t.m_wait ) {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
}
@ -477,14 +493,14 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// (1) created or
// (2) being removed from a completed task's wait list.

task_root_type ** const aggr = task->aggregate_dependences();
task_root_type * volatile * const aggr = t.aggregate_dependences();

// Assume the 'when_all' is complete until a dependence is
// found that is not complete.

bool is_complete = true ;

for ( int i = task->m_dep_count ; 0 < i && is_complete ; ) {
for ( int i = t.m_dep_count ; 0 < i && is_complete ; ) {

--i ;

@ -523,7 +539,7 @@ void TaskQueue< ExecSpace >::schedule_aggregate
// Complete the when_all 'task' to schedule other tasks
// that are waiting for the when_all 'task' to complete.

task->m_next = lock ;
t.m_next = lock ;

complete( task );

@ -573,7 +589,7 @@ void TaskQueue< ExecSpace >::complete
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

#if 0
#if KOKKOS_IMPL_DEBUG_TASKDAG_SCHEDULING
printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -584,11 +600,13 @@ void TaskQueue< ExecSpace >::complete
fflush( stdout );
#endif

const bool runnable = task_root_type::Aggregate != task->m_task_type ;
task_root_type volatile & t = *task ;

const bool runnable = task_root_type::Aggregate != t.m_task_type ;

//----------------------------------------

if ( runnable && lock != task->m_next ) {
if ( runnable && lock != t.m_next ) {
// A runnable task has finished executing and requested respawn.
// Schedule the task for subsequent execution.

@ -607,7 +625,7 @@ void TaskQueue< ExecSpace >::complete
// Stop other tasks from adding themselves to this task's wait queue
// by locking the head of this task's wait queue.

task_root_type * x = Kokkos::atomic_exchange( & task->m_wait , lock );
task_root_type * x = Kokkos::atomic_exchange( & t.m_wait , lock );

if ( x != (task_root_type *) lock ) {

@ -627,9 +645,13 @@ void TaskQueue< ExecSpace >::complete
// Have exclusive access to 'x' until it is scheduled
// Set x->m_next = zero <= no dependence, not a respawn

task_root_type * const next = x->m_next ; x->m_next = 0 ;
task_root_type volatile & vx = *x ;

if ( task_root_type::Aggregate != x->m_task_type ) {
task_root_type * const next = vx.m_next ; vx.m_next = 0 ;

Kokkos::memory_fence();

if ( task_root_type::Aggregate != vx.m_task_type ) {
schedule_runnable( x );
}
else {

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,7 +47,6 @@
#include <Kokkos_Array.hpp>

namespace Kokkos {
namespace Experimental {
namespace Impl {

template< class DataType , class ArrayLayout , class V , size_t N , class P >
@ -94,13 +93,12 @@ public:
typedef typename ViewDataType< non_const_scalar_type , array_scalar_dimension >::type non_const_scalar_array_type ;
};

}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
namespace Impl {

/** \brief View mapping for non-specialized data type and standard layout */
@ -597,7 +595,7 @@ public:
}
};

}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

@ -96,6 +96,27 @@ struct is_view_label< const char[N] > : public std::true_type {};
template< typename ... P >
struct ViewCtorProp ;

// Forward declare
template< typename Specialize , typename T >
struct CommonViewAllocProp ;

/* Common value_type stored as ViewCtorProp
*/
template< typename Specialize , typename T >
struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
{
ViewCtorProp() = default ;
ViewCtorProp( const ViewCtorProp & ) = default ;
ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;

using type = CommonViewAllocProp<Specialize,T> ;

ViewCtorProp( const type & arg ) : value( arg ) {}
ViewCtorProp( type && arg ) : value( arg ) {}

type value ;
};
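/* Illustrative sketch (assumed usage; 'common_prop' and 'prop' are hypothetical
 * names): a CommonViewAllocProp value is simply stored by copy in the 'value'
 * member of this ViewCtorProp specialization.
 *
 *   CommonViewAllocProp< Specialize , T > common_prop ;            // built elsewhere
 *   ViewCtorProp< void , CommonViewAllocProp< Specialize , T > >
 *     prop( common_prop ) ;                                        // prop.value == common_prop
 */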

/* std::integral_constant<unsigned,I> are dummy arguments
 * that avoid duplicate base class errors
 */

@ -62,7 +62,6 @@
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
namespace Impl {

template< unsigned I , size_t ... Args >
@ -250,7 +249,7 @@ struct ViewDimensionAssignable< ViewDimension< DstArgs ... >

};

}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -266,14 +265,11 @@ struct ALL_t {
}} // namespace Kokkos::Impl

namespace Kokkos {
namespace Experimental {
namespace Impl {

using Kokkos::Impl::ALL_t ;

template< class T >
struct is_integral_extent_type
{ enum { value = std::is_same<T,Kokkos::Experimental::Impl::ALL_t>::value ? 1 : 0 }; };
{ enum { value = std::is_same<T,Kokkos::Impl::ALL_t>::value ? 1 : 0 }; };

template< class iType >
struct is_integral_extent_type< std::pair<iType,iType> >
@ -314,10 +310,10 @@ struct SubviewLegalArgsCompileTime;

template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
enum { value =(((CurrentArg==RankDest-1) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
((CurrentArg>=RankDest) && (std::is_integral<Arg>::value)) ||
((CurrentArg<RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value)) ||
((CurrentArg==0) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value))
((CurrentArg==0) && (Kokkos::Impl::is_integral_extent_type<Arg>::value))
) && (SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
};

@ -331,7 +327,7 @@ struct SubviewLegalArgsCompileTime<Kokkos::LayoutLeft, Kokkos::LayoutLeft, RankD

template<int RankDest, int RankSrc, int CurrentArg, class Arg, class ... SubViewArgs>
struct SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg, Arg, SubViewArgs...> {
enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Experimental::Impl::is_integral_extent_type<Arg>::value)) ||
enum { value =(((CurrentArg==RankSrc-RankDest) && (Kokkos::Impl::is_integral_extent_type<Arg>::value)) ||
((CurrentArg<RankSrc-RankDest) && (std::is_integral<Arg>::value)) ||
((CurrentArg>=RankSrc-RankDest) && (std::is_same<Arg,Kokkos::Impl::ALL_t>::value))
) && (SubviewLegalArgsCompileTime<Kokkos::LayoutRight, Kokkos::LayoutRight, RankDest, RankSrc, CurrentArg+1, SubViewArgs...>::value)};
@ -403,7 +399,7 @@ private:
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim
, const Kokkos::Experimental::Impl::ALL_t
, const Kokkos::Impl::ALL_t
, Args ... args )
{
m_begin[ domain_rank ] = 0 ;
@ -519,7 +515,7 @@ private:
, unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim
, const Kokkos::Experimental::Impl::ALL_t
, const Kokkos::Impl::ALL_t
, Args ... args ) const
{
const int n = std::min( buf_len ,
@ -670,13 +666,12 @@ public:
{ return unsigned(i) < InternalRangeRank ? m_index[i] : ~0u ; }
};

}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
namespace Impl {

/** \brief Given a value type and dimension generate the View data type */
@ -814,13 +809,12 @@ public:
typedef non_const_type non_const_scalar_array_type ;
};

}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
namespace Impl {

template < class Dimension , class Layout , typename Enable = void >
@ -1228,14 +1222,14 @@ private:

// If memory alignment is a multiple of the trivial scalar size then attempt to align.
enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
enum { div_ok = div ? div : 1 }; // To avoid modulo by zero in constexpr
enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo by zero in constexpr

KOKKOS_INLINE_FUNCTION
static constexpr size_t stride( size_t const N )
{
return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
? N + align - ( N % div_ok ) : N ;
}
{
return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
? N + align - ( N % div_ok ) : N ;
}
};
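/* Illustrative example of the padding rule above (hypothetical numbers): with
 * an 8-byte trivial scalar and a 64-byte memory alignment that divides evenly,
 * div == 8, align == 8 and div_ok == 8. For a span of N == 1001 scalars that
 * exceeds MEMORY_ALIGNMENT_THRESHOLD * align, stride(N) pads the span to
 * 1001 + 8 - (1001 % 8) == 1008, the next multiple of div_ok; otherwise
 * stride(N) == N.
 */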

public:
@ -1707,12 +1701,12 @@ private:

// If memory alignment is a multiple of the trivial scalar size then attempt to align.
enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
enum { div_ok = div ? div : 1 }; // To avoid modulo by zero in constexpr
enum { div_ok = (div != 0) ? div : 1 }; // To avoid modulo by zero in constexpr

KOKKOS_INLINE_FUNCTION
static constexpr size_t stride( size_t const N )
{
return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
return ( (align != 0) && ((Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align) < N) && ((N % div_ok) != 0) )
? N + align - ( N % div_ok ) : N ;
}
};
@ -2225,13 +2219,12 @@ public:
{}
};

}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
namespace Impl {

/** \brief ViewDataHandle provides the type of the 'data handle' which the view
@ -2422,13 +2415,12 @@ struct ViewDataHandle< Traits ,
return handle_type( arg_data_ptr + offset );
}
};
}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
namespace Impl {

//----------------------------------------------------------------------------
@ -2451,8 +2443,9 @@ template< class ExecSpace , class ValueType >
struct ViewValueFunctor< ExecSpace , ValueType , false /* is_scalar */ >
{
typedef Kokkos::RangePolicy< ExecSpace > PolicyType ;
typedef typename ExecSpace::execution_space Exec;

ExecSpace space ;
Exec space ;
ValueType * ptr ;
size_t n ;
bool destroy ;
@ -2597,6 +2590,9 @@ private:

public:

typedef void printable_label_typedef;
enum { is_managed = Traits::is_managed };

//----------------------------------------
// Domain dimensions

@ -2944,7 +2940,7 @@ public:
Kokkos::abort("View Assignment: trying to assign runtime dimension to non matching compile time dimension.");
}
dst.m_offset = dst_offset_type( src.m_offset );
dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
dst.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
}
};

@ -3102,7 +3098,7 @@ public:

//----------------------------------------------------------------------------

}}} // namespace Kokkos::Experimental::Impl
}} // namespace Kokkos::Impl

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -3151,6 +3147,77 @@ void view_error_operator_bounds
view_error_operator_bounds<R+1>(buf+n,len-n,map,args...);
}

#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )

/* Check #3: is the View managed as determined by the MemoryTraits? */
template< class MapType,
bool is_managed = (MapType::is_managed != 0) >
struct OperatorBoundsErrorOnDevice;

template< class MapType >
struct OperatorBoundsErrorOnDevice< MapType, false > {
KOKKOS_INLINE_FUNCTION
static void run(MapType const&) {
Kokkos::abort("View bounds error");
}
};

template< class MapType >
struct OperatorBoundsErrorOnDevice< MapType, true > {
KOKKOS_INLINE_FUNCTION
static void run(MapType const& map) {
char const* const user_alloc_start = reinterpret_cast<char const*>(map.data());
char const* const header_start = user_alloc_start - sizeof(SharedAllocationHeader);
SharedAllocationHeader const* const header =
reinterpret_cast<SharedAllocationHeader const*>(header_start);
char const* const label = header->label();
enum { LEN = 128 };
char msg[LEN];
char const* const first_part = "View bounds error of view ";
char* p = msg;
char* const end = msg + LEN - 1;
for (char const* p2 = first_part; (*p2 != '\0') && (p < end); ++p, ++p2) {
*p = *p2;
}
for (char const* p2 = label; (*p2 != '\0') && (p < end); ++p, ++p2) {
*p = *p2;
}
*p = '\0';
Kokkos::abort(msg);
}
};

/* Check #2: does the ViewMapping have the printable_label_typedef defined?
See above that only the non-specialized standard-layout ViewMapping has
this defined by default.
The existence of this typedef indicates the existence of MapType::is_managed */
template< class T, class Enable = void >
struct has_printable_label_typedef : public std::false_type {};

template<class T>
struct has_printable_label_typedef<
T, typename enable_if_type<typename T::printable_label_typedef>::type>
: public std::true_type
{};

template< class MapType >
KOKKOS_INLINE_FUNCTION
void operator_bounds_error_on_device(
MapType const&,
std::false_type) {
Kokkos::abort("View bounds error");
}

template< class MapType >
KOKKOS_INLINE_FUNCTION
void operator_bounds_error_on_device(
MapType const& map,
std::true_type) {
OperatorBoundsErrorOnDevice< MapType >::run(map);
}

#endif // ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )

template< class MemorySpace , class MapType , class ... Args >
KOKKOS_INLINE_FUNCTION
void view_verify_operator_bounds
@ -3166,7 +3233,17 @@ void view_verify_operator_bounds
view_error_operator_bounds<0>( buffer + n , LEN - n , map , args ... );
Kokkos::Impl::throw_runtime_exception(std::string(buffer));
#else
Kokkos::abort("View bounds error");
/* Check #1: is there a SharedAllocationRecord?
(we won't use it, but if it's not there then there isn't
a corresponding SharedAllocationHeader containing a label).
This check should cover the case of Views that don't
have the Unmanaged trait but were initialized by pointer. */
if (tracker.has_record()) {
operator_bounds_error_on_device<MapType>(
map, has_printable_label_typedef<MapType>());
} else {
Kokkos::abort("View bounds error");
}
#endif
}
}
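/* Illustrative summary of the device-side bounds-error path above (assumed
 * flow, pieced together from Checks #1-#3):
 *   Check #1: tracker.has_record()             -> no allocation record: plain abort
 *   Check #2: has_printable_label_typedef<Map> -> tag-dispatches to OperatorBoundsErrorOnDevice
 *   Check #3: MapType::is_managed              -> managed: abort with the view label in the
 *                                                 message; unmanaged: plain "View bounds error"
 */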

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -48,7 +48,6 @@
//----------------------------------------------------------------------------

namespace Kokkos {
namespace Experimental {
namespace Impl {

// View mapping for rank two tiled array
@ -195,11 +194,9 @@ struct ViewMapping
};

} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */

namespace Kokkos {
namespace Experimental {

template< typename T , unsigned N0 , unsigned N1 , class ... P >
KOKKOS_INLINE_FUNCTION
@ -217,7 +214,6 @@ tile_subview( const Kokkos::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> &
( src , SrcLayout() , i_tile0 , i_tile1 );
}

} /* namespace Experimental */
} /* namespace Kokkos */

//----------------------------------------------------------------------------

@ -1,183 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/

#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )

#include <impl/Kokkos_spinwait.hpp>

#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>

/*--------------------------------------------------------------------------*/

#if !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define KOKKOS_INTERNAL_PAUSE
#else
/* Pause instruction to prevent excess processor bus usage */
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
#endif
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0u: KOKKOS_INTERNAL_NOP2; break;
case 1u: KOKKOS_INTERNAL_NOP4; break;
case 2u: KOKKOS_INTERNAL_NOP8; break;
case 3u: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#else
#include <sched.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
sched_yield();
}
}
#endif
#else // defined( _WIN32 )
#if defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
Sleep(0);
}
}
#elif defined( _MSC_VER )
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
YieldProcessor();
}
}
#else
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0: KOKKOS_INTERNAL_NOP2; break;
case 1: KOKKOS_INTERNAL_NOP4; break;
case 2: KOKKOS_INTERNAL_NOP8; break;
case 3: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#endif
#endif


/*--------------------------------------------------------------------------*/

namespace Kokkos {
namespace Impl {

void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}

void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}

void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}

void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}

} /* namespace Impl */
} /* namespace Kokkos */
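/* Illustrative usage sketch (assumption, not part of this file): a host thread
 * can wait on a shared flag with the progressive nop/pause back-off implemented
 * by kokkos_internal_yield above; 'flag' and its release value are hypothetical.
 *
 *   volatile int32_t flag = 0 ;                      // 0 == not yet released
 *   Kokkos::Impl::spinwait_while_equal( flag , 0 );  // returns once another thread sets flag != 0
 */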

#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}
#endif