git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@13583 f3b2605a-c512-4ea7-a41b-209d697bcdaa

Author: sjplimp
Date: 2015-07-14 19:57:25 +00:00
Parent: 47e13d72a8
Commit: 61ec2f78ac

195 changed files with 68433 additions and 0 deletions


@@ -0,0 +1,66 @@
KOKKOS_PATH = ../..
GTEST_PATH = ../../TPL/gtest

vpath %.cpp ${KOKKOS_PATH}/core/perf_test

default: build_all
	echo "End Build"

include $(KOKKOS_PATH)/Makefile.kokkos

ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= $(CXX)
LDFLAGS ?= -lpthread
endif

KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test

TEST_TARGETS =
TARGETS =

OBJ_PERF = PerfTestHost.o PerfTestCuda.o PerfTestMain.o gtest-all.o
TARGETS += KokkosCore_PerformanceTest
TEST_TARGETS += test-performance

OBJ_ATOMICS = test_atomic.o
TARGETS += KokkosCore_PerformanceTest_Atomics
TEST_TARGETS += test-atomic

KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest

KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS)
	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Atomics

test-performance: KokkosCore_PerformanceTest
	./KokkosCore_PerformanceTest

test-atomic: KokkosCore_PerformanceTest_Atomics
	./KokkosCore_PerformanceTest_Atomics

build_all: $(TARGETS)

test: $(TEST_TARGETS)

clean: kokkos-clean
	rm -f *.o $(TARGETS)

# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
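
# Usage sketch (illustrative; assumes the Kokkos Makefile system has been
# configured, e.g. KOKKOS_DEVICES set when invoking make):
#   make build_all    # builds KokkosCore_PerformanceTest and the _Atomics binary
#   make test         # runs the test-performance and test-atomic targets
#   make clean        # removes objects and executables (plus kokkos-clean)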


@@ -0,0 +1,309 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BLAS_KERNELS_HPP
#define KOKKOS_BLAS_KERNELS_HPP
namespace Kokkos {
template< class ConstVectorType ,
class Device = typename ConstVectorType::execution_space >
struct Dot ;
template< class ConstVectorType ,
class Device = typename ConstVectorType::execution_space >
struct DotSingle ;
template< class ConstScalarType ,
class VectorType ,
class Device = typename VectorType::execution_space >
struct Scale ;
template< class ConstScalarType ,
class ConstVectorType ,
class VectorType ,
class Device = typename VectorType::execution_space >
struct AXPBY ;
/** \brief Y = alpha * X + beta * Y */
template< class ConstScalarType ,
class ConstVectorType ,
class VectorType >
void axpby( const ConstScalarType & alpha ,
const ConstVectorType & X ,
const ConstScalarType & beta ,
const VectorType & Y )
{
typedef AXPBY< ConstScalarType , ConstVectorType , VectorType > functor ;
parallel_for( Y.dimension_0() , functor( alpha , X , beta , Y ) );
}
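// Illustrative usage (a sketch, not part of this header; the length n is
// hypothetical). The scalar arguments are rank-0 Views, matching the
// alpha() / beta() accesses in the AXPBY functor below:
//
//   Kokkos::View<double*> X("X",n), Y("Y",n);
//   Kokkos::View<double>  a("alpha"), b("beta");
//   Kokkos::axpby( a, X, b, Y );   // Y = a() * X + b() * Y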
/** \brief Y *= alpha */
template< class ConstScalarType ,
class VectorType >
void scale( const ConstScalarType & alpha , const VectorType & Y )
{
typedef Scale< ConstScalarType , VectorType > functor ;
parallel_for( Y.dimension_0() , functor( alpha , Y ) );
}
template< class ConstVectorType ,
class Finalize >
void dot( const ConstVectorType & X ,
const ConstVectorType & Y ,
const Finalize & finalize )
{
typedef Dot< ConstVectorType > functor ;
parallel_reduce( X.dimension_0() , functor( X , Y ) , finalize );
}
template< class ConstVectorType ,
class Finalize >
void dot( const ConstVectorType & X ,
const Finalize & finalize )
{
typedef DotSingle< ConstVectorType > functor ;
parallel_reduce( X.dimension_0() , functor( X ) , finalize );
}
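// Note: both dot() overloads deliver the reduction result through the
// caller-supplied `finalize` argument of parallel_reduce rather than
// returning it by value.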
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< class Type , class Device >
struct Dot
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
typedef double value_type ;
#if 1
typename Type::const_type X ;
typename Type::const_type Y ;
#else
Type X ;
Type Y ;
#endif
Dot( const Type & arg_x , const Type & arg_y )
: X(arg_x) , Y(arg_y) { }
KOKKOS_INLINE_FUNCTION
void operator()( int i , value_type & update ) const
{ update += X[i] * Y[i]; }
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
const volatile value_type & source )
{ update += source; }
KOKKOS_INLINE_FUNCTION
static void init( value_type & update )
{ update = 0 ; }
};
template< class Type , class Device >
struct DotSingle
{
typedef typename Device::execution_space execution_space ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< Type::Rank > >::type ok_rank ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename Type::execution_space >::type ok_device ;*/
typedef double value_type ;
#if 1
typename Type::const_type X ;
#else
Type X ;
#endif
DotSingle( const Type & arg_x ) : X(arg_x) {}
KOKKOS_INLINE_FUNCTION
void operator()( int i , value_type & update ) const
{
const typename Type::value_type & x = X[i]; update += x * x ;
}
KOKKOS_INLINE_FUNCTION
static void join( volatile value_type & update ,
const volatile value_type & source )
{ update += source; }
KOKKOS_INLINE_FUNCTION
static void init( value_type & update )
{ update = 0 ; }
};
template< class ScalarType , class VectorType , class Device>
struct Scale
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
#if 1
typename ScalarType::const_type alpha ;
#else
ScalarType alpha ;
#endif
VectorType Y ;
Scale( const ScalarType & arg_alpha , const VectorType & arg_Y )
: alpha( arg_alpha ), Y( arg_Y ) {}
KOKKOS_INLINE_FUNCTION
void operator()( int i ) const
{
Y[i] *= alpha() ;
}
};
template< class ScalarType ,
class ConstVectorType ,
class VectorType,
class Device>
struct AXPBY
{
typedef typename Device::execution_space execution_space ;
/* typedef typename
Impl::StaticAssertSame< execution_space ,
typename ScalarType::execution_space >::type
ok_scalar_device ;
typedef typename
Impl::StaticAssertSame< execution_space ,
typename ConstVectorType::execution_space >::type
ok_const_vector_device ;
typedef typename
Impl::StaticAssertSame< execution_space ,
typename VectorType::execution_space >::type
ok_vector_device ;*/
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
Impl::unsigned_< ScalarType::Rank > >::type
ok_scalar_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< ConstVectorType::Rank > >::type
ok_const_vector_rank ;
typedef typename
Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
Impl::unsigned_< VectorType::Rank > >::type
ok_vector_rank ;
#if 1
typename ScalarType::const_type alpha , beta ;
typename ConstVectorType::const_type X ;
#else
ScalarType alpha , beta ;
ConstVectorType X ;
#endif
VectorType Y ;
AXPBY( const ScalarType & arg_alpha ,
const ConstVectorType & arg_X ,
const ScalarType & arg_beta ,
const VectorType & arg_Y )
: alpha( arg_alpha ), beta( arg_beta ), X( arg_X ), Y( arg_Y ) {}
KOKKOS_INLINE_FUNCTION
void operator()( int i ) const
{
Y[i] = alpha() * X[i] + beta() * Y[i] ;
}
};
} /* namespace Kokkos */
#endif /* #ifndef KOKKOS_BLAS_KERNELS_HPP */


@@ -0,0 +1,189 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <iomanip>
#include <algorithm>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_CUDA )
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
#include <PerfTestDriver.hpp>
namespace Test {
class cuda : public ::testing::Test {
protected:
static void SetUpTestCase() {
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
}
static void TearDownTestCase() {
Kokkos::Cuda::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
TEST_F( cuda, hexgrad )
{
EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
}
TEST_F( cuda, gramschmidt )
{
EXPECT_NO_THROW( run_test_gramschmidt< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
}
namespace {
template <typename T>
struct TextureFetch
{
typedef Kokkos::View< T *, Kokkos::CudaSpace> array_type;
typedef Kokkos::View< const T *, Kokkos::CudaSpace, Kokkos::MemoryRandomAccess> const_array_type;
typedef Kokkos::View< int *, Kokkos::CudaSpace> index_array_type;
typedef Kokkos::View< const int *, Kokkos::CudaSpace> const_index_array_type;
struct FillArray
{
array_type m_array;
FillArray( const array_type & array )
: m_array(array)
{}
void apply() const
{
Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this);
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const { m_array(i) = i; }
};
struct RandomIndexes
{
index_array_type m_indexes;
typename index_array_type::HostMirror m_host_indexes;
RandomIndexes( const index_array_type & indexes)
: m_indexes(indexes)
, m_host_indexes(Kokkos::create_mirror(m_indexes))
{}
void apply() const
{
Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::HostSpace::execution_space,int>(0,m_host_indexes.size()), *this);
//random shuffle
Kokkos::HostSpace::execution_space::fence();
std::random_shuffle(m_host_indexes.ptr_on_device(), m_host_indexes.ptr_on_device() + m_host_indexes.size());
Kokkos::deep_copy(m_indexes,m_host_indexes);
}
KOKKOS_INLINE_FUNCTION
void operator()(int i) const { m_host_indexes(i) = i; }
};
struct RandomReduce
{
const_array_type m_array;
const_index_array_type m_indexes;
RandomReduce( const const_array_type & array, const const_index_array_type & indexes)
: m_array(array)
, m_indexes(indexes)
{}
void apply(T & reduce) const
{
Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this, reduce);
}
KOKKOS_INLINE_FUNCTION
void operator()(int i, T & reduce) const
{ reduce += m_array(m_indexes(i)); }
};
static void run(int size, double & reduce_time, T &reduce)
{
array_type array("array",size);
index_array_type indexes("indexes",size);
{ FillArray f(array); f.apply(); }
{ RandomIndexes f(indexes); f.apply(); }
Kokkos::Cuda::fence();
Kokkos::Impl::Timer timer;
for (int j=0; j<10; ++j) {
RandomReduce f(array,indexes);
f.apply(reduce);
}
Kokkos::Cuda::fence();
reduce_time = timer.seconds();
}
};
} // unnamed namespace
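// The texture-fetch benchmark below sweeps sizes 2^1 .. 2^27; at the top
// end, 2^27 doubles is 8 * 2^27 bytes = 1 GiB of device data.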
TEST_F( cuda, texture_double )
{
printf("Random reduce of double through texture fetch\n");
for (int i=1; i<=27; ++i) {
int size = 1<<i;
double time = 0;
double reduce = 0;
TextureFetch<double>::run(size,time,reduce);
printf(" time = %1.3e size = 2^%d\n", time, i);
}
}
} // namespace Test
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */


@@ -0,0 +1,152 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <iostream>
#include <string>
// mfh 06 Jun 2013: This macro doesn't work like one might think it
// should. It doesn't take the template parameter DeviceType and
// print its actual type name; it just literally prints out
// "DeviceType". I've worked around this below without using the
// macro, so I'm commenting out the macro to avoid compiler complaints
// about an unused macro.
// #define KOKKOS_MACRO_IMPL_TO_STRING( X ) #X
// #define KOKKOS_MACRO_TO_STRING( X ) KOKKOS_MACRO_IMPL_TO_STRING( X )
//------------------------------------------------------------------------
namespace Test {
enum { NUMBER_OF_TRIALS = 5 };
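// Each driver below runs NUMBER_OF_TRIALS timed trials per power-of-two
// work length and prints one comma-separated record:
//   "<label>" , work_length , min_seconds , min_seconds / work_length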
template< class DeviceType >
void run_test_hexgrad( int exp_beg , int exp_end, const char deviceTypeName[] )
{
std::string label_hexgrad ;
label_hexgrad.append( "\"HexGrad< double , " );
// mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
// the string, not the actual name of the device type. Thus, I've
// modified the function to take the name of the device type.
//
//label_hexgrad.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
label_hexgrad.append( deviceTypeName );
label_hexgrad.append( " >\"" );
for (int i = exp_beg ; i < exp_end ; ++i) {
double min_seconds = 0.0 ;
double max_seconds = 0.0 ;
double avg_seconds = 0.0 ;
const int parallel_work_length = 1<<i;
for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ;
if ( 0 == j ) {
min_seconds = seconds ;
max_seconds = seconds ;
}
else {
if ( seconds < min_seconds ) min_seconds = seconds ;
if ( seconds > max_seconds ) max_seconds = seconds ;
}
avg_seconds += seconds ;
}
avg_seconds /= NUMBER_OF_TRIALS ;
std::cout << label_hexgrad
<< " , " << parallel_work_length
<< " , " << min_seconds
<< " , " << ( min_seconds / parallel_work_length )
<< std::endl ;
}
}
template< class DeviceType >
void run_test_gramschmidt( int exp_beg , int exp_end, const char deviceTypeName[] )
{
std::string label_gramschmidt ;
label_gramschmidt.append( "\"GramSchmidt< double , " );
// mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
// the string, not the actual name of the device type. Thus, I've
// modified the function to take the name of the device type.
//
//label_gramschmidt.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
label_gramschmidt.append( deviceTypeName );
label_gramschmidt.append( " >\"" );
for (int i = exp_beg ; i < exp_end ; ++i) {
double min_seconds = 0.0 ;
double max_seconds = 0.0 ;
double avg_seconds = 0.0 ;
const int parallel_work_length = 1<<i;
for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ;
if ( 0 == j ) {
min_seconds = seconds ;
max_seconds = seconds ;
}
else {
if ( seconds < min_seconds ) min_seconds = seconds ;
if ( seconds > max_seconds ) max_seconds = seconds ;
}
avg_seconds += seconds ;
}
avg_seconds /= NUMBER_OF_TRIALS ;
std::cout << label_gramschmidt
<< " , " << parallel_work_length
<< " , " << min_seconds
<< " , " << ( min_seconds / parallel_work_length )
<< std::endl ;
}
}
}


@@ -0,0 +1,231 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cmath>
#include <PerfTestBlasKernels.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Test {
// Reduction : result = dot( Q(:,j) , Q(:,j) );
// PostProcess : R(j,j) = result ; inv = 1 / result ;
template< class VectorView , class ValueView >
struct InvNorm2 : public Kokkos::DotSingle< VectorView > {
typedef typename Kokkos::DotSingle< VectorView >::value_type value_type ;
ValueView Rjj ;
ValueView inv ;
InvNorm2( const VectorView & argX ,
const ValueView & argR ,
const ValueView & argInv )
: Kokkos::DotSingle< VectorView >( argX )
, Rjj( argR )
, inv( argInv )
{}
KOKKOS_INLINE_FUNCTION
void final( value_type & result ) const
{
result = sqrt( result );
Rjj() = result ;
inv() = ( 0 < result ) ? 1.0 / result : 0 ;
}
};
template< class VectorView , class ValueView >
inline
void invnorm2( const VectorView & x ,
const ValueView & r ,
const ValueView & r_inv )
{
Kokkos::parallel_reduce( x.dimension_0() , InvNorm2< VectorView , ValueView >( x , r , r_inv ) );
}
// PostProcess : tmp = - ( R(j,k) = result );
template< class VectorView , class ValueView >
struct DotM : public Kokkos::Dot< VectorView > {
typedef typename Kokkos::Dot< VectorView >::value_type value_type ;
ValueView Rjk ;
ValueView tmp ;
DotM( const VectorView & argX ,
const VectorView & argY ,
const ValueView & argR ,
const ValueView & argTmp )
: Kokkos::Dot< VectorView >( argX , argY )
, Rjk( argR )
, tmp( argTmp )
{}
KOKKOS_INLINE_FUNCTION
void final( value_type & result ) const
{
Rjk() = result ;
tmp() = - result ;
}
};
template< class VectorView , class ValueView >
inline
void dot_neg( const VectorView & x ,
const VectorView & y ,
const ValueView & r ,
const ValueView & r_neg )
{
Kokkos::parallel_reduce( x.dimension_0() , DotM< VectorView , ValueView >( x , y , r , r_neg ) );
}
template< typename Scalar , class DeviceType >
struct ModifiedGramSchmidt
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef Kokkos::View< Scalar** ,
Kokkos::LayoutLeft ,
execution_space > multivector_type ;
typedef Kokkos::View< Scalar* ,
Kokkos::LayoutLeft ,
execution_space > vector_type ;
typedef Kokkos::View< Scalar ,
Kokkos::LayoutLeft ,
execution_space > value_view ;
multivector_type Q ;
multivector_type R ;
static double factorization( const multivector_type Q_ ,
const multivector_type R_ )
{
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
using Kokkos::Experimental::ALL ;
#else
const Kokkos::ALL ALL ;
#endif
const size_type count = Q_.dimension_1();
value_view tmp("tmp");
value_view one("one");
Kokkos::deep_copy( one , (Scalar) 1 );
Kokkos::Impl::Timer timer ;
for ( size_type j = 0 ; j < count ; ++j ) {
// Reduction : tmp = dot( Q(:,j) , Q(:,j) );
// PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
const vector_type Qj = Kokkos::subview( Q_ , ALL , j );
const value_view Rjj = Kokkos::subview( R_ , j , j );
invnorm2( Qj , Rjj , tmp );
// Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ;
Kokkos::scale( tmp , Qj );
for ( size_t k = j + 1 ; k < count ; ++k ) {
const vector_type Qk = Kokkos::subview( Q_ , ALL , k );
const value_view Rjk = Kokkos::subview( R_ , j , k );
// Reduction : R(j,k) = dot( Q(:,j) , Q(:,k) );
// PostProcess : tmp = - R(j,k);
dot_neg( Qj , Qk , Rjk , tmp );
// Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j)
Kokkos::axpby( tmp , Qj , one , Qk );
}
}
execution_space::fence();
return timer.seconds();
}
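// Cost note: the j/k loops above launch one invnorm2 reduction per column
// plus count*(count-1)/2 dot_neg reductions and matching axpby updates,
// i.e. O(count^2) kernels over length-sized vectors.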
//--------------------------------------------------------------------------
static double test( const size_t length ,
const size_t count ,
const size_t iter = 1 )
{
multivector_type Q_( "Q" , length , count );
multivector_type R_( "R" , count , count );
typename multivector_type::HostMirror A =
Kokkos::create_mirror( Q_ );
// Create and fill A on the host
for ( size_type j = 0 ; j < count ; ++j ) {
for ( size_type i = 0 ; i < length ; ++i ) {
A(i,j) = ( i + 1 ) * ( j + 1 );
}
}
double dt_min = 0 ;
for ( size_t i = 0 ; i < iter ; ++i ) {
Kokkos::deep_copy( Q_ , A );
// A = Q * R
const double dt = factorization( Q_ , R_ );
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
}
return dt_min ;
}
};
}


@@ -0,0 +1,268 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
namespace Test {
template< class DeviceType ,
typename CoordScalarType = double ,
typename GradScalarType = float >
struct HexGrad
{
typedef DeviceType execution_space ;
typedef typename execution_space::size_type size_type ;
typedef HexGrad<DeviceType,CoordScalarType,GradScalarType> self_type;
// 3D array : ( ParallelWork , Space , Node )
enum { NSpace = 3 , NNode = 8 };
typedef Kokkos::View< CoordScalarType*[NSpace][NNode] , execution_space >
elem_coord_type ;
typedef Kokkos::View< GradScalarType*[NSpace][NNode] , execution_space >
elem_grad_type ;
elem_coord_type coords ;
elem_grad_type grad_op ;
enum { FLOPS = 318 }; // = 3 * ( 18 + 8 * 11 )
enum { READS = 18 };
enum { WRITES = 18 };
HexGrad( const elem_coord_type & arg_coords ,
const elem_grad_type & arg_grad_op )
: coords( arg_coords )
, grad_op( arg_grad_op )
{}
KOKKOS_INLINE_FUNCTION static
void grad( const CoordScalarType x[] ,
const CoordScalarType z[] ,
GradScalarType grad_y[] )
{
const GradScalarType R42=(x[3] - x[1]);
const GradScalarType R52=(x[4] - x[1]);
const GradScalarType R54=(x[4] - x[3]);
const GradScalarType R63=(x[5] - x[2]);
const GradScalarType R83=(x[7] - x[2]);
const GradScalarType R86=(x[7] - x[5]);
const GradScalarType R31=(x[2] - x[0]);
const GradScalarType R61=(x[5] - x[0]);
const GradScalarType R74=(x[6] - x[3]);
const GradScalarType R72=(x[6] - x[1]);
const GradScalarType R75=(x[6] - x[4]);
const GradScalarType R81=(x[7] - x[0]);
const GradScalarType t1=(R63 + R54);
const GradScalarType t2=(R61 + R74);
const GradScalarType t3=(R72 + R81);
const GradScalarType t4 =(R86 + R42);
const GradScalarType t5 =(R83 + R52);
const GradScalarType t6 =(R75 + R31);
// Calculate Y gradient from X and Z data
grad_y[0] = (z[1] * t1) - (z[2] * R42) - (z[3] * t5) + (z[4] * t4) + (z[5] * R52) - (z[7] * R54);
grad_y[1] = (z[2] * t2) + (z[3] * R31) - (z[0] * t1) - (z[5] * t6) + (z[6] * R63) - (z[4] * R61);
grad_y[2] = (z[3] * t3) + (z[0] * R42) - (z[1] * t2) - (z[6] * t4) + (z[7] * R74) - (z[5] * R72);
grad_y[3] = (z[0] * t5) - (z[1] * R31) - (z[2] * t3) + (z[7] * t6) + (z[4] * R81) - (z[6] * R83);
grad_y[4] = (z[5] * t3) + (z[6] * R86) - (z[7] * t2) - (z[0] * t4) - (z[3] * R81) + (z[1] * R61);
grad_y[5] = (z[6] * t5) - (z[4] * t3) - (z[7] * R75) + (z[1] * t6) - (z[0] * R52) + (z[2] * R72);
grad_y[6] = (z[7] * t1) - (z[5] * t5) - (z[4] * R86) + (z[2] * t4) - (z[1] * R63) + (z[3] * R83);
grad_y[7] = (z[4] * t2) - (z[6] * t1) + (z[5] * R75) - (z[3] * t6) - (z[2] * R74) + (z[0] * R54);
}
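// operator() below reuses this single Y-gradient kernel for all three
// components by cyclically permuting the coordinate arguments:
// grad(z,y) yields the x component, grad(x,z) the y component, and
// grad(y,x) the z component.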
KOKKOS_INLINE_FUNCTION
void operator()( size_type ielem ) const
{
GradScalarType g[NNode] ;
const CoordScalarType x[NNode] = {
coords(ielem,0,0),
coords(ielem,0,1),
coords(ielem,0,2),
coords(ielem,0,3),
coords(ielem,0,4),
coords(ielem,0,5),
coords(ielem,0,6),
coords(ielem,0,7)
};
const CoordScalarType y[NNode] = {
coords(ielem,1,0),
coords(ielem,1,1),
coords(ielem,1,2),
coords(ielem,1,3),
coords(ielem,1,4),
coords(ielem,1,5),
coords(ielem,1,6),
coords(ielem,1,7)
};
const CoordScalarType z[NNode] = {
coords(ielem,2,0),
coords(ielem,2,1),
coords(ielem,2,2),
coords(ielem,2,3),
coords(ielem,2,4),
coords(ielem,2,5),
coords(ielem,2,6),
coords(ielem,2,7)
};
grad( z , y , g );
grad_op(ielem,0,0) = g[0];
grad_op(ielem,0,1) = g[1];
grad_op(ielem,0,2) = g[2];
grad_op(ielem,0,3) = g[3];
grad_op(ielem,0,4) = g[4];
grad_op(ielem,0,5) = g[5];
grad_op(ielem,0,6) = g[6];
grad_op(ielem,0,7) = g[7];
grad( x , z , g );
grad_op(ielem,1,0) = g[0];
grad_op(ielem,1,1) = g[1];
grad_op(ielem,1,2) = g[2];
grad_op(ielem,1,3) = g[3];
grad_op(ielem,1,4) = g[4];
grad_op(ielem,1,5) = g[5];
grad_op(ielem,1,6) = g[6];
grad_op(ielem,1,7) = g[7];
grad( y , x , g );
grad_op(ielem,2,0) = g[0];
grad_op(ielem,2,1) = g[1];
grad_op(ielem,2,2) = g[2];
grad_op(ielem,2,3) = g[3];
grad_op(ielem,2,4) = g[4];
grad_op(ielem,2,5) = g[5];
grad_op(ielem,2,6) = g[6];
grad_op(ielem,2,7) = g[7];
}
//--------------------------------------------------------------------------
struct Init {
typedef typename self_type::execution_space execution_space ;
elem_coord_type coords ;
Init( const elem_coord_type & arg_coords )
: coords( arg_coords ) {}
KOKKOS_INLINE_FUNCTION
void operator()( size_type ielem ) const
{
coords(ielem,0,0) = 0.;
coords(ielem,1,0) = 0.;
coords(ielem,2,0) = 0.;
coords(ielem,0,1) = 1.;
coords(ielem,1,1) = 0.;
coords(ielem,2,1) = 0.;
coords(ielem,0,2) = 1.;
coords(ielem,1,2) = 1.;
coords(ielem,2,2) = 0.;
coords(ielem,0,3) = 0.;
coords(ielem,1,3) = 1.;
coords(ielem,2,3) = 0.;
coords(ielem,0,4) = 0.;
coords(ielem,1,4) = 0.;
coords(ielem,2,4) = 1.;
coords(ielem,0,5) = 1.;
coords(ielem,1,5) = 0.;
coords(ielem,2,5) = 1.;
coords(ielem,0,6) = 1.;
coords(ielem,1,6) = 1.;
coords(ielem,2,6) = 1.;
coords(ielem,0,7) = 0.;
coords(ielem,1,7) = 1.;
coords(ielem,2,7) = 1.;
}
};
//--------------------------------------------------------------------------
static double test( const int count , const int iter = 1 )
{
elem_coord_type coord( "coord" , count );
elem_grad_type grad ( "grad" , count );
// Execute the parallel kernels on the arrays:
double dt_min = 0 ;
Kokkos::parallel_for( count , Init( coord ) );
execution_space::fence();
for ( int i = 0 ; i < iter ; ++i ) {
Kokkos::Impl::Timer timer ;
Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
}
return dt_min ;
}
};
}


@@ -0,0 +1,104 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_OPENMP )
typedef Kokkos::OpenMP TestHostDevice ;
const char TestHostDeviceName[] = "Kokkos::OpenMP" ;
#elif defined( KOKKOS_HAVE_PTHREAD )
typedef Kokkos::Threads TestHostDevice ;
const char TestHostDeviceName[] = "Kokkos::Threads" ;
#elif defined( KOKKOS_HAVE_SERIAL )
typedef Kokkos::Serial TestHostDevice ;
const char TestHostDeviceName[] = "Kokkos::Serial" ;
#else
# error "You must enable at least one of the following execution spaces in order to build this test: Kokkos::Threads, Kokkos::OpenMP, or Kokkos::Serial."
#endif
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
#include <PerfTestDriver.hpp>
//------------------------------------------------------------------------
namespace Test {
class host : public ::testing::Test {
protected:
static void SetUpTestCase()
{
const unsigned team_count = Kokkos::hwloc::get_available_numa_count();
const unsigned threads_per_team = 4 ;
TestHostDevice::initialize( team_count * threads_per_team );
}
static void TearDownTestCase()
{
TestHostDevice::finalize();
}
};
TEST_F( host, hexgrad ) {
EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
}
TEST_F( host, gramschmidt ) {
EXPECT_NO_THROW(run_test_gramschmidt< TestHostDevice>( 10, 20, TestHostDeviceName ));
}
} // namespace Test


@@ -0,0 +1,49 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <gtest/gtest.h>
int main(int argc, char *argv[]) {
::testing::InitGoogleTest(&argc,argv);
return RUN_ALL_TESTS();
}


@@ -0,0 +1,504 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Timer.hpp>
typedef Kokkos::DefaultExecutionSpace exec_space;
#define RESET 0
#define BRIGHT 1
#define DIM 2
#define UNDERLINE 3
#define BLINK 4
#define REVERSE 7
#define HIDDEN 8
#define BLACK 0
#define RED 1
#define GREEN 2
#define YELLOW 3
#define BLUE 4
#define MAGENTA 5
#define CYAN 6
#define GREY 7
#define WHITE 8
void textcolor(int attr, int fg, int bg)
{ char command[13];
/* Command is the control command to the terminal */
sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
printf("%s", command);
}
void textcolor_standard() {textcolor(RESET, BLACK, WHITE);}
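/* Example: textcolor(BRIGHT, RED, BLACK) emits "\033[1;31;40m"
   (attr 1, foreground 1+30, background 0+40). */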
template<class T,class DEVICE_TYPE>
struct ZeroFunctor{
typedef DEVICE_TYPE execution_space;
typedef typename Kokkos::View<T,execution_space> type;
typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
data() = 0;
}
};
//---------------------------------------------------
//--------------atomic_fetch_add---------------------
//---------------------------------------------------
template<class T,class DEVICE_TYPE>
struct AddFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
Kokkos::atomic_fetch_add(&data(),(T)1);
}
};
template<class T>
T AddLoop(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct AddFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T,class DEVICE_TYPE>
struct AddNonAtomicFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
data()+=(T)1;
}
};
template<class T>
T AddLoopNonAtomic(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct AddNonAtomicFunctor<T,exec_space> f_add;
f_add.data = data;
Kokkos::parallel_for(loop,f_add);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T>
T AddLoopSerial(int loop) {
T* data = new T[1];
data[0] = 0;
for(int i=0;i<loop;i++)
*data+=(T)1;
T val = *data;
delete [] data;
return val;
}
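//---------------------------------------------------
//----------atomic_compare_exchange (CAS)------------
//---------------------------------------------------
// Classic CAS-loop increment: retry until the value read before the
// update ("assumed") matches what atomic_compare_exchange observed,
// i.e. no other thread modified data() in between.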
template<class T,class DEVICE_TYPE>
struct CASFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
T old = data();
T newval, assumed;
do {
assumed = old;
newval = assumed + (T)1;
old = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
}
while( old != assumed );
}
};
template<class T>
T CASLoop(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct CASFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T,class DEVICE_TYPE>
struct CASNonAtomicFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
volatile T assumed;
volatile T newval;
bool fail=1;
do {
assumed = data();
newval = assumed + (T)1;
if(data()==assumed) {
data() = newval;
fail = 0;
}
}
while(fail);
}
};
template<class T>
T CASLoopNonAtomic(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct CASNonAtomicFunctor<T,exec_space> f_cas;
f_cas.data = data;
Kokkos::parallel_for(loop,f_cas);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
T val = h_data();
return val;
}
template<class T>
T CASLoopSerial(int loop) {
T* data = new T[1];
data[0] = 0;
for(int i=0;i<loop;i++) {
T assumed;
T newval;
T old;
do {
assumed = *data;
newval = assumed + (T)1;
old = *data;
*data = newval;
}
while(!(assumed==old));
}
T val = *data;
delete [] data;
return val;
}
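//---------------------------------------------------
//--------------atomic_exchange----------------------
//---------------------------------------------------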
template<class T,class DEVICE_TYPE>
struct ExchFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data, data2;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
T old = Kokkos::atomic_exchange(&data(),(T)i);
Kokkos::atomic_fetch_add(&data2(),old);
}
};
template<class T>
T ExchLoop(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct ExchFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);
T val = h_data() + h_data2();
return val;
}
template<class T,class DEVICE_TYPE>
struct ExchNonAtomicFunctor{
typedef DEVICE_TYPE execution_space;
typedef Kokkos::View<T,execution_space> type;
type data, data2;
KOKKOS_INLINE_FUNCTION
void operator()(int i) const {
T old = data();
data()=(T) i;
data2()+=old;
}
};
template<class T>
T ExchLoopNonAtomic(int loop) {
struct ZeroFunctor<T,exec_space> f_zero;
typename ZeroFunctor<T,exec_space>::type data("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
f_zero.data = data;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
typename ZeroFunctor<T,exec_space>::type data2("Data");
typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
f_zero.data = data2;
Kokkos::parallel_for(1,f_zero);
exec_space::fence();
struct ExchNonAtomicFunctor<T,exec_space> f_exch;
f_exch.data = data;
f_exch.data2 = data2;
Kokkos::parallel_for(loop,f_exch);
exec_space::fence();
Kokkos::deep_copy(h_data,data);
Kokkos::deep_copy(h_data2,data2);
T val = h_data() + h_data2();
return val;
}
template<class T>
T ExchLoopSerial(int loop) {
T* data = new T[1];
T* data2 = new T[1];
data[0] = 0;
data2[0] = 0;
for(int i=0;i<loop;i++) {
T old = *data;
*data=(T) i;
*data2+=old;
}
T val = *data2 + *data;
delete [] data;
delete [] data2;
return val;
}
template<class T>
T LoopVariant(int loop, int test) {
switch (test) {
case 1: return AddLoop<T>(loop);
case 2: return CASLoop<T>(loop);
case 3: return ExchLoop<T>(loop);
}
return 0;
}
template<class T>
T LoopVariantSerial(int loop, int test) {
switch (test) {
case 1: return AddLoopSerial<T>(loop);
case 2: return CASLoopSerial<T>(loop);
case 3: return ExchLoopSerial<T>(loop);
}
return 0;
}
template<class T>
T LoopVariantNonAtomic(int loop, int test) {
switch (test) {
case 1: return AddLoopNonAtomic<T>(loop);
case 2: return CASLoopNonAtomic<T>(loop);
case 3: return ExchLoopNonAtomic<T>(loop);
}
return 0;
}
template<class T>
void Loop(int loop, int test, const char* type_name) {
LoopVariant<T>(loop,test);
Kokkos::Impl::Timer timer;
T res = LoopVariant<T>(loop,test);
double time1 = timer.seconds();
timer.reset();
T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
double time2 = timer.seconds();
timer.reset();
T resSerial = LoopVariantSerial<T>(loop,test);
double time3 = timer.seconds();
time1*=1e6/loop;
time2*=1e6/loop;
time3*=1e6/loop;
//textcolor_standard();
bool passed = true;
if(resSerial!=res) passed = false;
//if(!passed) textcolor(RESET,BLACK,YELLOW);
printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
//if(!passed) textcolor_standard();
printf("\n");
}
template<class T>
void Test(int loop, int test, const char* type_name) {
if(test==-1) {
Loop<T>(loop,1,type_name);
Loop<T>(loop,2,type_name);
Loop<T>(loop,3,type_name);
}
else
Loop<T>(loop,test,type_name);
}
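// Command-line options (parsed below): --type N selects the scalar type
// (1-6: integer variants, 11: double; float (10) is disabled), --test
// selects the loop variant (1=add, 2=CAS, 3=exchange), and -l/--loop N
// sets the iteration count (default 1000000). With no options, all types
// and all tests are run.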
int main(int argc, char* argv[])
{
int type = -1;
int loop = 1000000;
int test = -1;
for(int i=0;i<argc;i++)
{
if((strcmp(argv[i],"--test")==0)) {test=atoi(argv[++i]); continue;}
if((strcmp(argv[i],"--type")==0)) {type=atoi(argv[++i]); continue;}
if((strcmp(argv[i],"-l")==0)||(strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;}
}
Kokkos::initialize(argc,argv);
printf("Using %s\n",Kokkos::atomic_query_version());
bool all_tests = false;
if(type==-1) all_tests = true;
while(type<100) {
if(type==1) {
Test<int>(loop,test,"int ");
}
if(type==2) {
Test<long int>(loop,test,"long int ");
}
if(type==3) {
Test<long long int>(loop,test,"long long int ");
}
if(type==4) {
Test<unsigned int>(loop,test,"unsigned int ");
}
if(type==5) {
Test<unsigned long int>(loop,test,"unsigned long int ");
}
if(type==6) {
Test<unsigned long long int>(loop,test,"unsigned long long int ");
}
if(type==10) {
//Test<float>(loop,test,"float ");
}
if(type==11) {
Test<double>(loop,test,"double ");
}
if(!all_tests) type=100;
else type++;
}
Kokkos::finalize();
}


@@ -0,0 +1,283 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( KOKKOS_HAVE_CUDA )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
template< typename ValueType , typename AliasType >
struct CudaTextureFetch {
::cudaTextureObject_t m_obj ;
const ValueType * m_ptr ;
int m_offset ;
// Dereference operator pulls through the texture object and returns by value
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_ptr[ i ];
#endif
}
// Pointer to referenced memory
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( CudaTextureFetch && rhs )
: m_obj( rhs.m_obj )
, m_ptr( rhs.m_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
{
m_obj = rhs.m_obj ;
m_ptr = rhs.m_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
// Texture object spans the entire allocation.
// This handle may view a subset of the allocation, so an offset is required.
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
)
// 'attach_texture_object' returns 0 when __CUDA_ARCH__ < 300
: m_obj( record.template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
};
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
template< typename ValueType , typename AliasType >
struct CudaLDGFetch {
const ValueType * m_ptr ;
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
KOKKOS_INLINE_FUNCTION
CudaLDGFetch() : m_ptr() {}
KOKKOS_INLINE_FUNCTION
~CudaLDGFetch() {}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch( const CudaLDGFetch & rhs )
: m_ptr( rhs.m_ptr )
{}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch( CudaLDGFetch && rhs )
: m_ptr( rhs.m_ptr )
{}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
{
m_ptr = rhs.m_ptr ;
return *this ;
}
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
)
: m_ptr( arg_ptr )
{}
};
#endif
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
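// For example, a view declared as
//   Kokkos::View< const float * , Kokkos::CudaSpace
//               , Kokkos::MemoryTraits< Kokkos::RandomAccess > >
// satisfies every condition below (trivial 4-byte const value type in a
// Cuda memory space with the RandomAccess trait) and therefore receives
// the texture-fetch (or LDG) data handle instead of a raw pointer.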
template< class Traits >
class ViewDataHandle< Traits ,
typename std::enable_if<(
// Is Cuda memory space
( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
&&
// Is a trivial const value of 4, 8, or 16 bytes
std::is_trivial<typename Traits::const_value_type>::value
&&
std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
&&
( sizeof(typename Traits::const_value_type) == 4 ||
sizeof(typename Traits::const_value_type) == 8 ||
sizeof(typename Traits::const_value_type) == 16 )
&&
// Random access trait
( Traits::memory_traits::RandomAccess != 0 )
)>::type >
{
public:
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using value_type = typename Traits::const_value_type ;
using return_type = typename Traits::const_value_type ; // NOT a reference
using alias_type = typename std::conditional< ( sizeof(value_type) == 4 ) , int ,
typename std::conditional< ( sizeof(value_type) == 8 ) , ::int2 ,
typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
>::type
>::type
>::type ;
#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
#else
using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
#endif
KOKKOS_INLINE_FUNCTION
static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
{
return arg_handle ;
}
KOKKOS_INLINE_FUNCTION
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
#endif
}
};
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */

View File

@@ -0,0 +1,277 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDAEXEC_HPP
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <string>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct CudaTraits {
enum { WarpSize = 32 /* 0x0020 */ };
enum { WarpIndexMask = 0x001f /* Mask for warp index */ };
enum { WarpIndexShift = 5 /* WarpSize == 1 << WarpIndexShift */ };
enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
enum { UpperBoundGridCount = 65535 /* Hard upper bound */ };
enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
enum { ConstantMemoryUsage = 0x008000 /* 32k bytes */ };
enum { ConstantMemoryCache = 0x002000 /* 8k bytes */ };
typedef unsigned long
ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_count( CudaSpace::size_type i )
{ return ( i + WarpIndexMask ) >> WarpIndexShift ; }
KOKKOS_INLINE_FUNCTION static
CudaSpace::size_type warp_align( CudaSpace::size_type i )
{
enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
return ( i + WarpIndexMask ) & Mask ;
}
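// Illustrative arithmetic: warp_count(33) == 2 and warp_align(33) == 64 ,
// since 33 threads occupy two 32-thread warps.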
};
//----------------------------------------------------------------------------
CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __CUDACC__ )
/** \brief Access to constant memory on the device */
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
__device__ __constant__
Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
kokkos_impl_cuda_constant_memory_buffer ;
__device__ __constant__
int* kokkos_impl_cuda_atomic_lock_array ;
#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
namespace Kokkos {
namespace Impl {
__device__ inline
bool lock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
}
__device__ inline
void unlock_address_cuda_space(void* ptr) {
size_t offset = size_t(ptr);
offset = offset >> 2;
offset = offset & CUDA_SPACE_ATOMIC_MASK;
//offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
}
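// Illustrative usage (a sketch, not an exact Kokkos call site): a thread
// performs its non-atomic update only after a successful acquire, e.g.
//   bool done = false ;
//   while ( ! done ) {
//     if ( lock_address_cuda_space( ptr ) ) {
//       /* non-atomic update of *ptr */
//       unlock_address_cuda_space( ptr );
//       done = true ;
//     }
//   }
// Distinct addresses may hash to the same lock entry, which merely
// serializes those updates; it does not affect correctness.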
}
}
template< typename T >
inline
__device__
T * kokkos_impl_cuda_shared_memory()
{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
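// Example: obtain a typed pointer to the kernel's dynamic shared memory,
//   double * scratch = kokkos_impl_cuda_shared_memory<double>();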
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize L1 cache and minimize shared memory:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
// For 2.0 capability: 48 KB L1 and 16 KB shared
//----------------------------------------------------------------------------
template< class DriverType >
__global__
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
*((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
driver();
}
template< class DriverType >
__global__
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
}
template < class DriverType ,
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
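// Example of the 'Large' selection: a 600-byte functor exceeds
// ConstantMemoryUseThreshold (512 bytes), so Large == true and the functor
// is launched through the constant-memory specialization below, while a
// 64-byte functor is passed by value as a kernel argument.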
template < class DriverType >
struct CudaParallelLaunch< DriverType , true > {
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
}
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
Kokkos::Cuda::fence();
CUDA_SAFE_CALL( cudaGetLastError() );
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType , false > {
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 );
}
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
Kokkos::Cuda::fence();
CUDA_SAFE_CALL( cudaGetLastError() );
#endif
}
}
};
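// Illustrative call site (a sketch; grid, block, and shmem are computed
// by the parallel dispatch that instantiates the driver):
//   CudaParallelLaunch< DriverType >( driver , grid , block , shmem );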
//----------------------------------------------------------------------------
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */

View File

@@ -0,0 +1,670 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
DeepCopy<CudaSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
DeepCopy<CudaSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
DeepCopy<HostSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
DeepCopy<HostSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
DeepCopy<CudaSpace,HostSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
enum { TEXTURE_BOUND_1D = 2u << 27 };
if ( tracker.attribute() == NULL ) {
// check for correct allocator
const bool ok_alloc = tracker.allocator()->support_texture_binding();
const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
if (ok_alloc && ok_count) {
Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
tracker.set_attribute( attr );
}
else {
std::ostringstream oss;
oss << "Error: Cannot attach texture object";
if (!ok_alloc) {
oss << ", incompatabile allocator " << tracker.allocator()->name();
}
if (!ok_count) {
oss << ", array " << tracker.label() << " too large";
}
oss << ".";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
std::ostringstream oss;
oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
} // unnamed namespace
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
void CudaSpace::access_error()
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
void CudaSpace::access_error( const void * const )
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
Kokkos::Impl::throw_runtime_exception( msg );
}
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
bool CudaUVMSpace::available()
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
enum { UVM_available = true };
#else
enum { UVM_available = false };
#endif
return UVM_available;
}
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
CudaSpace::CudaSpace()
: m_device( Kokkos::Cuda().cuda_device() )
{
}
CudaUVMSpace::CudaUVMSpace()
: m_device( Kokkos::Cuda().cuda_device() )
{
}
CudaHostPinnedSpace::CudaHostPinnedSpace()
{
}
void * CudaSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, arg_alloc_size ) );
return ptr ;
}
void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
return ptr ;
}
void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr, arg_alloc_size , cudaHostAllocDefault ) );
return ptr ;
}
void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
} catch(...) {}
}
void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
} catch(...) {}
}
void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) );
} catch(...) {}
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record ;
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record ;
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record ;
::cudaTextureObject_t
SharedAllocationRecord< Kokkos::CudaSpace , void >::
attach_texture_object( const unsigned sizeof_alias
, void * const alloc_ptr
, size_t const alloc_size )
{
// Only valid for 300 <= __CUDA_ARCH__ ;
// otherwise this returns zero.
::cudaTextureObject_t tex_obj ;
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = ( sizeof_alias == 4 ? cudaCreateChannelDesc< int >() :
( sizeof_alias == 8 ? cudaCreateChannelDesc< ::int2 >() :
/* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4 >() ) );
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & tex_obj , & resDesc, & texDesc, NULL ) );
return tex_obj ;
}
std::string
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_label() const
{
SharedAllocationHeader header ;
Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::CudaSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );
return std::string( header.m_label );
}
std::string
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_label() const
{
return std::string( RecordBase::head()->m_label );
}
std::string
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_label() const
{
return std::string( RecordBase::head()->m_label );
}
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::
allocate( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
allocate( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
allocate( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
}
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
SharedAllocationRecord< Kokkos::CudaSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::CudaSpace , void >::
SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_tex_obj( 0 )
, m_space( arg_space )
{
SharedAllocationHeader header ;
// Fill in the Header information
header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
strncpy( header.m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Copy to device memory
Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>::DeepCopy( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_tex_obj( 0 )
, m_space( arg_space )
{
// Fill in the Header information, directly accessible via UVM
RecordBase::m_alloc_ptr->m_record = this ;
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
// Fill in the Header information, directly accessible via UVM
RecordBase::m_alloc_ptr->m_record = this ;
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordBase = SharedAllocationRecord< void , void > ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
#if 0
// Copy the header from the allocation
SharedAllocationHeader head ;
SharedAllocationHeader const * const head_cuda = Header::get_header( alloc_ptr );
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
RecordCuda * const record = static_cast< RecordCuda * >( head.m_record );
if ( record->m_alloc_ptr != head_cuda ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
}
#else
// Iterate the list to search for the record among all allocations
// requires obtaining the root of the list and then locking the list.
RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) );
if ( record == 0 ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
}
#endif
return record ;
}
SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
if ( h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordCuda * >( h->m_record );
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * alloc_ptr )
{
using Header = SharedAllocationHeader ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > ;
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
if ( h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
}
return static_cast< RecordCuda * >( h->m_record );
}
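// Illustrative layout assumed by the pointer arithmetic above:
//   [ SharedAllocationHeader | user allocation ... ]
// The header sits immediately before the pointer handed to the user, so
// 'reinterpret_cast<Header*>(alloc_ptr) - 1' recovers the header and,
// through it, the record.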
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
char buffer[256] ;
SharedAllocationHeader head ;
if ( detail ) {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
}
else {
head.m_label[0] = 0 ;
}
snprintf( buffer , 256 , "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
, reinterpret_cast<unsigned long>( r )
, reinterpret_cast<unsigned long>( r->m_prev )
, reinterpret_cast<unsigned long>( r->m_next )
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<unsigned long>( r->m_dealloc )
, head.m_label
);
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
else {
do {
if ( r->m_alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
snprintf( buffer , 256 , "Cuda [ 0x%.12lx + %ld ] %s\n"
, reinterpret_cast< unsigned long >( r->data() )
, r->size()
, head.m_label
);
}
else {
snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
}
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
}
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
__global__ void init_lock_array_kernel() {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<CUDA_SPACE_ATOMIC_MASK+1)
kokkos_impl_cuda_atomic_lock_array[i] = 0;
}
}
namespace Impl {
int* lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
return ptr;
}
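// With CUDA_SPACE_ATOMIC_MASK == 0x1FFFF the function above lazily
// allocates 0x20000 == 131072 int lock entries ( 512 KiB ) on first use.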
void init_lock_array_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
is_initialized = 1;
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
}
}
}
}
#endif // KOKKOS_HAVE_CUDA

View File

@@ -0,0 +1,183 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos {
namespace Impl {
template< class DestructFunctor >
SharedAllocationRecord *
shared_allocation_record( Kokkos::CudaSpace const & arg_space
, void * const arg_alloc_ptr
, DestructFunctor const & arg_destruct )
{
SharedAllocationRecord * const record = SharedAllocationRecord::get_record( arg_alloc_ptr );
// assert: record != 0
// assert: sizeof(DestructFunctor) <= record->m_destruct_size
// assert: record->m_destruct_function == 0
DestructFunctor * const functor =
reinterpret_cast< DestructFunctor * >(
reinterpret_cast< unsigned long >( record ) + sizeof(SharedAllocationRecord) );
new( functor ) DestructFunctor( arg_destruct );
record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
return record ;
}
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif // #ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP

View File

@@ -0,0 +1,192 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
TextureAttribute::TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
)
: m_tex_obj(0)
{
cuda_device_synchronize();
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = desc ;
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
cuda_device_synchronize();
}
TextureAttribute::~TextureAttribute()
{
if (m_tex_obj) {
cudaDestroyTextureObject( m_tex_obj );
}
}
/*--------------------------------------------------------------------------*/
void * CudaMallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
return ptr;
}
void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
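// Illustrative semantics of reallocate above: shrinking from 1024 to 256
// bytes copies the first 256 bytes into a fresh allocation and frees the
// old one; growing copies all of the old bytes.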
/*--------------------------------------------------------------------------*/
void * CudaUVMAllocator::allocate( size_t size )
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
return ptr;
#else
throw_runtime_exception( "CUDA VERSION does not support UVM" );
return NULL;
#endif
}
void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaHostAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
return ptr;
}
void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
} catch(...) {}
}
void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA

View File

@@ -0,0 +1,187 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos { namespace Impl {
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
// to be an 'unsigned long long'. This could change with
// future versions of Cuda and this typedef would have to
// change accordingly.
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
typedef enable_if<
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
::cudaTextureObject_t >::type cuda_texture_object_type ;
#else
typedef const void * cuda_texture_object_type ;
#endif
struct TextureAttribute : public AllocatorAttributeBase
{
cuda_texture_object_type m_tex_obj ;
TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
);
~TextureAttribute();
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP

View File

@@ -0,0 +1,69 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_ERROR_HPP
#define KOKKOS_CUDA_ERROR_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
namespace Kokkos { namespace Impl {
void cuda_device_synchronize();
void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
inline void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
{
if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); }
}
#define CUDA_SAFE_CALL( call ) \
Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ )
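// Example usage; a failing call throws with the call text, file, and line:
//   CUDA_SAFE_CALL( cudaMalloc( & ptr , nbytes ) );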
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif //KOKKOS_CUDA_ERROR_HPP

View File

@@ -0,0 +1,678 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/*--------------------------------------------------------------------------*/
/* Kokkos interfaces */
#include <Kokkos_Core.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
#include <stdlib.h>
/* Standard 'C++' libraries */
#include <vector>
#include <iostream>
#include <sstream>
#include <string>
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
__device__ __constant__
Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
kokkos_impl_cuda_constant_memory_buffer ;
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace {
__global__
void query_cuda_kernel_arch( int * d_arch )
{
#if defined( __CUDA_ARCH__ )
*d_arch = __CUDA_ARCH__ ;
#else
*d_arch = 0 ;
#endif
}
/** Query the compute capability that is actually launched on the device: */
int cuda_kernel_arch()
{
int * d_arch = 0 ;
cudaMalloc( (void **) & d_arch , sizeof(int) );
query_cuda_kernel_arch<<<1,1>>>( d_arch );
int arch = 0 ;
cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault );
cudaFree( d_arch );
return arch ;
}
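// Example: a kernel compiled for and launched on a compute capability 3.5
// device reports __CUDA_ARCH__ == 350, so cuda_kernel_arch() returns 350.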
bool cuda_launch_blocking()
{
const char * env = getenv("CUDA_LAUNCH_BLOCKING");
if (env == 0) return false;
return atoi(env);
}
}
void cuda_device_synchronize()
{
// static const bool launch_blocking = cuda_launch_blocking();
// if (!launch_blocking) {
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
// }
}
void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
{
std::ostringstream out ;
out << name << " error( " << cudaGetErrorName(e) << "): " << cudaGetErrorString(e);
if (file) {
out << " " << file << ":" << line;
}
throw_runtime_exception( out.str() );
}
//----------------------------------------------------------------------------
// Some significant cuda device properties:
//
// cudaDeviceProp::name : Text label for device
// cudaDeviceProp::major : Device major number
// cudaDeviceProp::minor : Device minor number
// cudaDeviceProp::warpSize : number of threads per warp
// cudaDeviceProp::multiProcessorCount : number of multiprocessors
// cudaDeviceProp::sharedMemPerBlock : capacity of shared memory per block
// cudaDeviceProp::totalConstMem : capacity of constant memory
// cudaDeviceProp::totalGlobalMem : capacity of global memory
// cudaDeviceProp::maxGridSize[3] : maximum grid size
//
// Section 4.4.2.4 of the CUDA Toolkit Reference Manual
//
// struct cudaDeviceProp {
// char name[256];
// size_t totalGlobalMem;
// size_t sharedMemPerBlock;
// int regsPerBlock;
// int warpSize;
// size_t memPitch;
// int maxThreadsPerBlock;
// int maxThreadsDim[3];
// int maxGridSize[3];
// size_t totalConstMem;
// int major;
// int minor;
// int clockRate;
// size_t textureAlignment;
// int deviceOverlap;
// int multiProcessorCount;
// int kernelExecTimeoutEnabled;
// int integrated;
// int canMapHostMemory;
// int computeMode;
// int concurrentKernels;
// int ECCEnabled;
// int pciBusID;
// int pciDeviceID;
// int tccDriver;
// int asyncEngineCount;
// int unifiedAddressing;
// int memoryClockRate;
// int memoryBusWidth;
// int l2CacheSize;
// int maxThreadsPerMultiProcessor;
// };
namespace {
class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 8 };
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_cudaDevCount ;
CudaInternalDevices();
static const CudaInternalDevices & singleton();
};
CudaInternalDevices::CudaInternalDevices()
{
// See 'cudaSetDeviceFlags' for host-device thread interaction
// Section 4.4.2.6 of the CUDA Toolkit Reference Manual
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
}
}
const CudaInternalDevices & CudaInternalDevices::singleton()
{
static CudaInternalDevices self ; return self ;
}
}
//----------------------------------------------------------------------------
class CudaInternal {
private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
AllocationTracker m_scratchFlagsTracker;
AllocationTracker m_scratchSpaceTracker;
AllocationTracker m_scratchUnifiedTracker;
public:
typedef Cuda::size_type size_type ;
int m_cudaDev ;
int m_cudaArch ;
unsigned m_maxWarpCount ;
unsigned m_maxBlock ;
unsigned m_maxSharedWords ;
size_type m_scratchSpaceCount ;
size_type m_scratchFlagsCount ;
size_type m_scratchUnifiedCount ;
size_type m_scratchUnifiedSupported ;
size_type m_streamCount ;
size_type * m_scratchSpace ;
size_type * m_scratchFlags ;
size_type * m_scratchUnified ;
cudaStream_t * m_stream ;
static CudaInternal & singleton();
int verify_is_initialized( const char * const label ) const ;
int is_initialized() const
{ return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
void initialize( int cuda_device_id , int stream_count );
void finalize();
void print_configuration( std::ostream & ) const ;
~CudaInternal();
CudaInternal()
: m_cudaDev( -1 )
, m_cudaArch( -1 )
, m_maxWarpCount( 0 )
, m_maxBlock( 0 )
, m_maxSharedWords( 0 )
, m_scratchSpaceCount( 0 )
, m_scratchFlagsCount( 0 )
, m_scratchUnifiedCount( 0 )
, m_scratchUnifiedSupported( 0 )
, m_streamCount( 0 )
, m_scratchSpace( 0 )
, m_scratchFlags( 0 )
, m_scratchUnified( 0 )
, m_stream( 0 )
{}
size_type * scratch_space( const size_type size );
size_type * scratch_flags( const size_type size );
size_type * scratch_unified( const size_type size );
};
//----------------------------------------------------------------------------
void CudaInternal::print_configuration( std::ostream & s ) const
{
const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
#if defined( KOKKOS_HAVE_CUDA )
s << "macro KOKKOS_HAVE_CUDA : defined" << std::endl ;
#endif
#if defined( CUDA_VERSION )
s << "macro CUDA_VERSION = " << CUDA_VERSION
<< " = version " << CUDA_VERSION / 1000
<< "." << ( CUDA_VERSION % 1000 ) / 10
<< std::endl ;
#endif
for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) {
s << "Kokkos::Cuda[ " << i << " ] "
<< dev_info.m_cudaProp[i].name
<< " capability " << dev_info.m_cudaProp[i].major << "." << dev_info.m_cudaProp[i].minor
<< ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem)
<< ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock);
if ( m_cudaDev == i ) s << " : Selected" ;
s << std::endl ;
}
}
//----------------------------------------------------------------------------
CudaInternal::~CudaInternal()
{
if ( m_stream ||
m_scratchSpace ||
m_scratchFlags ||
m_scratchUnified ) {
std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
<< std::endl ;
std::cerr.flush();
}
m_cudaDev = -1 ;
m_cudaArch = -1 ;
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
m_scratchUnifiedSupported = 0 ;
m_streamCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_stream = 0 ;
}
int CudaInternal::verify_is_initialized( const char * const label ) const
{
if ( m_cudaDev < 0 ) {
std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
}
return 0 <= m_cudaDev ;
}
CudaInternal & CudaInternal::singleton()
{
static CudaInternal self ;
return self ;
}
void CudaInternal::initialize( int cuda_device_id , int stream_count )
{
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
const std::string msg("Cuda::initialize ERROR : HostSpace::execution_space is not initialized");
throw_runtime_exception( msg );
}
const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
const bool ok_id = 0 <= cuda_device_id &&
cuda_device_id < dev_info.m_cudaDevCount ;
// Need device capability 2.0 or better
const bool ok_dev = ok_id &&
( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
if ( ok_init && ok_dev ) {
const struct cudaDeviceProp & cudaProp =
dev_info.m_cudaProp[ cuda_device_id ];
m_cudaDev = cuda_device_id ;
CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
CUDA_SAFE_CALL( cudaDeviceReset() );
Kokkos::Impl::cuda_device_synchronize();
// Query the compute capability for which the loaded kernels were compiled:
m_cudaArch = cuda_kernel_arch();
if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
<< " on device with compute capability "
<< cudaProp.major << "." << cudaProp.minor
<< " , this will likely reduce potential performance."
<< std::endl ;
}
//----------------------------------
// Maximum number of warps:
// the final reduction step uses one thread per contributing warp,
// so at most WarpSize warps can participate.
// HCE 2012-February :
// Found a bug in CUDA 4.1 where a kernel launch would sometimes fail
// if the thread count == 1024 and a functor was passed to the kernel.
// Copying the kernel to constant memory and then launching with
// thread count == 1024 worked fine.
//
// HCE 2012-October :
// All compute capabilities support at least 16 warps (512 threads).
// However, we have found that 8 warps typically give better performance.
m_maxWarpCount = 8 ;
// m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
m_maxWarpCount = Impl::CudaTraits::WarpSize ;
}
m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
//----------------------------------
// Maximum number of blocks:
m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
//----------------------------------
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
if ( ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
<< " does not support unified virtual address space"
<< std::endl ;
}
//----------------------------------
// Multiblock reduction uses scratch flags for counters
// and scratch space for partial reduction values.
// Allocate some initial space. This will grow as needed.
{
const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
(void) scratch_unified( 16 * sizeof(size_type) );
(void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) );
(void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
}
//----------------------------------
if ( stream_count ) {
m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
m_streamCount = stream_count ;
for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
}
}
else {
std::ostringstream msg ;
msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
if ( ! ok_init ) {
msg << " : Already initialized" ;
}
if ( ! ok_id ) {
msg << " : Device identifier out of range "
<< "[0.." << dev_info.m_cudaDevCount << "]" ;
}
else if ( ! ok_dev ) {
msg << " : Device " ;
msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
msg << "." ;
msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
msg << " has insufficient capability, required 2.0 or better" ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
// Initialize the lock array used for arbitrarily sized atomics
Impl::init_lock_array_cuda_space();
}
//----------------------------------------------------------------------------
typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ;
enum { sizeScratchGrain = sizeof(ScratchGrain) };
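// Example: scratch allocations below are rounded up to whole grains.
// With WarpSize == 32 and a 4-byte size_type a grain is 128 bytes, so
// a request of 100 bytes occupies ( 100 + 127 ) / 128 == 1 grain.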
Cuda::size_type *
CudaInternal::scratch_flags( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
return m_scratchFlags ;
}
Cuda::size_type *
CudaInternal::scratch_space( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
}
return m_scratchSpace ;
}
Cuda::size_type *
CudaInternal::scratch_unified( const Cuda::size_type size )
{
if ( verify_is_initialized("scratch_unified") &&
m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
}
return m_scratchUnified ;
}
//----------------------------------------------------------------------------
void CudaInternal::finalize()
{
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
lock_array_cuda_space_ptr(true);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
m_stream[i] = 0 ;
}
::free( m_stream );
}
m_scratchSpaceTracker.clear();
m_scratchFlagsTracker.clear();
m_scratchUnifiedTracker.clear();
m_cudaDev = -1 ;
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;
m_maxSharedWords = 0 ;
m_scratchSpaceCount = 0 ;
m_scratchFlagsCount = 0 ;
m_scratchUnifiedCount = 0 ;
m_streamCount = 0 ;
m_scratchSpace = 0 ;
m_scratchFlags = 0 ;
m_scratchUnified = 0 ;
m_stream = 0 ;
}
}
//----------------------------------------------------------------------------
Cuda::size_type cuda_internal_maximum_warp_count()
{ return CudaInternal::singleton().m_maxWarpCount ; }
Cuda::size_type cuda_internal_maximum_grid_count()
{ return CudaInternal::singleton().m_maxBlock ; }
Cuda::size_type cuda_internal_maximum_shared_words()
{ return CudaInternal::singleton().m_maxSharedWords ; }
Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_space( size ); }
Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_flags( size ); }
Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
{ return CudaInternal::singleton().scratch_unified( size ); }
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
Cuda::size_type Cuda::detect_device_count()
{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); }
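// Example (sketch): selecting device 0 with a single stream instance:
//   Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) , 1 );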
std::vector<unsigned>
Cuda::detect_device_arch()
{
const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton();
std::vector<unsigned> output( s.m_cudaDevCount );
for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) {
output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ;
}
return output ;
}
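// Example (sketch): decoding an entry of detect_device_arch(); each
// entry packs major*100 + minor, so 305 denotes capability 3.5:
//   const unsigned arch = archs[i] ;
//   const unsigned major = arch / 100 , minor = arch % 100 ;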
Cuda::size_type Cuda::device_arch()
{
const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ;
int dev_arch = 0 ;
if ( 0 <= dev_id ) {
const struct cudaDeviceProp & cudaProp =
Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ;
dev_arch = cudaProp.major * 100 + cudaProp.minor ;
}
return dev_arch ;
}
void Cuda::finalize()
{ Impl::CudaInternal::singleton().finalize(); }
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream( 0 )
{
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
}
Cuda::Cuda( const int instance_id )
: m_device( Impl::CudaInternal::singleton().m_cudaDev )
, m_stream(
Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" )
? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
: 0 )
{}
void Cuda::print_configuration( std::ostream & s , const bool )
{ Impl::CudaInternal::singleton().print_configuration( s ); }
bool Cuda::sleep() { return false ; }
bool Cuda::wake() { return true ; }
void Cuda::fence()
{
Kokkos::Impl::cuda_device_synchronize();
}
} // namespace Kokkos
#endif // KOKKOS_HAVE_CUDA
//----------------------------------------------------------------------------


@ -0,0 +1,165 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_INTERNAL_HPP
#define KOKKOS_CUDA_INTERNAL_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Cuda/Kokkos_Cuda_Error.hpp>
namespace Kokkos { namespace Impl {
template<class DriverType>
int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
int numBlocks;
if(Large) {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
} else {
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
}
#endif
}
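// The search above doubles the candidate block size while the occupancy
// calculator still reports at least one resident block per
// multiprocessor. A sketch of the resulting contract, with a
// hypothetical driver type 'MyDriver':
//   const int bs = cuda_get_max_block_size<MyDriver>( functor );
//   // bs is the largest tested power-of-two block size <= 1024 for
//   // which at least one block remains resident.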
template<class DriverType>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
if(Large) {
while(blockSize<1024) {
blockSize*=2;
//calculate the occupancy with that block size and check whether it is larger than the largest one found so far
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
} else {
while(blockSize<1024) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
}
return bestBlockSize;
#endif
}
}} // namespace Kokkos::Impl
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */

File diff suppressed because it is too large


@ -0,0 +1,424 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
#define KOKKOS_CUDA_REDUCESCAN_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//Shfl based reductions
/*
* Algorithmic constraints:
* (a) threads with same threadIdx.y have same value
* (b) blockDim.x == power of two
* (c) blockDim.z == 1
*/
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_warp_reduction( ValueType& result,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
unsigned int shift = 1;
//Reduce over values from threads with different threadIdx.y
while(blockDim.x * shift < 32 ) {
const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
//Only join if upper thread is active (this allows non power of two for blockDim.y)
if(threadIdx.y + shift < max_active_thread)
join(result , tmp);
shift*=2;
}
result = shfl(result,0,32);
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_inter_warp_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
#define STEP_WIDTH 4
__shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
const unsigned step = 32 / blockDim.x;
unsigned shift = STEP_WIDTH;
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
if(id < STEP_WIDTH ) {
result[id] = value;
}
__syncthreads();
while (shift<=max_active_thread/step) {
if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
join(result[id%STEP_WIDTH],value);
}
__syncthreads();
shift+=STEP_WIDTH;
}
value = result[0];
for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++)
join(value,result[i]);
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_block_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
cuda_intra_warp_reduction(value,join,max_active_thread);
cuda_inter_warp_reduction(value,join,max_active_thread);
}
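// Example (sketch): a minimal JoinOp usable with the reductions above.
// 'SumJoin' is a hypothetical name, not part of the original file.
template< class ValueType >
struct SumJoin {
  __device__ inline
  void operator()( ValueType & dst , const ValueType & src ) const
  { dst += src ; }
};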
template< class FunctorType , class JoinOp>
__device__
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type value,
const JoinOp& join,
Cuda::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
Cuda::size_type * const m_scratch_flags,
const int max_active_thread = blockDim.y) {
typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
cuda_intra_block_reduction(value,join,max_active_thread);
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
//One thread in the block writes block result to global scratch_memory
if(id == 0 ) {
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
*global = value;
}
//One warp of the last block performs the inter-block reduction by loading the block values from global scratch memory
bool last_block = false;
__syncthreads();
if ( id < 32 ) {
Cuda::size_type count;
//Figure out whether this is the last block
if(id == 0)
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
count = Kokkos::shfl(count,0,32);
//Last block does the inter block reduction
if( count == gridDim.x - 1) {
//set flag back to zero
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = 0;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
//Reduce all global values with splitting work over threads in one warp
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
for(int i=id; i<gridDim.x; i+=step_size) {
value_type tmp = global[i];
join(value, tmp);
}
//Perform shfl reductions within the warp; only join if the contribution is valid (allows gridDim.x to be non power of two and < 32)
if (blockDim.x*blockDim.y > 1) {
value_type tmp = Kokkos::shfl_down(value, 1,32);
if( id + 1 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < gridDim.x )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < gridDim.x )
join(value, tmp);
}
}
}
//The last block has in its thread=0 the global reduction value through "value"
return last_block;
}
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
// __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
// function qualifier which could be used to improve performance.
//----------------------------------------------------------------------------
// Maximize shared memory and minimize L1 cache:
// cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
// For 2.0 capability: 48 KB shared and 16 KB L1
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) blockDim.y is a power of two
* (b) blockDim.y <= 512
* (c) blockDim.x == blockDim.z == 1
*/
template< bool DoScan , class FunctorType , class ArgTag >
__device__
void cuda_intra_block_reduce_scan( const FunctorType & functor ,
const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
{
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
const unsigned value_count = ValueTraits::value_count( functor );
const unsigned BlockSizeMask = blockDim.y - 1 ;
// Must have power of two thread count
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_reduce_scan requires power-of-two blockDim.y"); }
#define BLOCK_REDUCE_STEP( R , TD , S ) \
if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
#define BLOCK_SCAN_STEP( TD , N , S ) \
if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
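// Example expansion of BLOCK_REDUCE_STEP with R = rtid_intra and S = 0:
// a thread whose reversed rank has bit 0 clear joins the value one
// slot below its own:
//   if ( ! ( rtid_intra & 1 ) )
//   { ValueJoin::join( functor , tdata_intra , tdata_intra - value_count ); }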
const unsigned rtid_intra = threadIdx.y ^ BlockSizeMask ;
const pointer_type tdata_intra = base_data + value_count * threadIdx.y ;
{ // Intra-warp reduction:
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
}
__syncthreads(); // Wait for all warps to reduce
{ // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
if ( rtid_inter < blockDim.y ) {
const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
if ( (1<<5) < BlockSizeMask ) { BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
if ( DoScan ) {
int n = ( rtid_inter & 32 ) ? 32 : (
( rtid_inter & 64 ) ? 64 : (
( rtid_inter & 128 ) ? 128 : (
( rtid_inter & 256 ) ? 256 : 0 )));
if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
BLOCK_SCAN_STEP(tdata_inter,n,8)
BLOCK_SCAN_STEP(tdata_inter,n,7)
BLOCK_SCAN_STEP(tdata_inter,n,6)
BLOCK_SCAN_STEP(tdata_inter,n,5)
}
}
}
__syncthreads(); // Wait for inter-warp reduce-scan to complete
if ( DoScan ) {
int n = ( rtid_intra & 1 ) ? 1 : (
( rtid_intra & 2 ) ? 2 : (
( rtid_intra & 4 ) ? 4 : (
( rtid_intra & 8 ) ? 8 : (
( rtid_intra & 16 ) ? 16 : 0 ))));
if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
BLOCK_SCAN_STEP(tdata_intra,n,0)
}
#undef BLOCK_SCAN_STEP
#undef BLOCK_REDUCE_STEP
}
//----------------------------------------------------------------------------
/**\brief Input value-per-thread starting at 'shared_data'.
* Reduction value at last thread's location.
*
* If 'DoScan' then write blocks' scan values and block-groups' scan values.
*
* Global reduce result is in the last threads' 'shared_data' location.
*/
template< bool DoScan , class FunctorType , class ArgTag >
__device__
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
{
typedef Cuda::size_type size_type ;
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const unsigned BlockSizeMask = blockDim.y - 1 ;
const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y );
// Must have power of two thread count
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
// Reduce the accumulation for the entire block.
cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
{
// Write accumulation total to global scratch space.
// Accumulation total is the last thread's data.
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
// If this block is not the last block to contribute to this group then the block is done.
const bool is_last_block =
! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
if ( is_last_block ) {
const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ;
const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ;
{
void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
reference_type shared_value = ValueInit::init( functor , shared_ptr );
for ( size_type i = b ; i < e ; ++i ) {
ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
}
}
cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
if ( DoScan ) {
size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y );
if ( ! threadIdx.y ) { ValueInit::init( functor , shared_value ); }
// Join previous inclusive scan value to each member
for ( size_type i = b ; i < e ; ++i ) {
size_type * const global_value = global_data + word_count.value * i ;
ValueJoin::join( functor , shared_value , global_value );
ValueOps ::copy( functor , global_value , shared_value );
}
}
}
return is_last_block ;
}
// Size in bytes required for inter block reduce or scan
template< bool DoScan , class FunctorType , class ArgTag >
inline
unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
{
return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
}
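// Example: for a functor with a single 8-byte reduction value and a
// block size of 256 the shared memory requirement is
// ( 256 + 2 ) * 8 == 2064 bytes.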
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA ) */
#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */


@ -0,0 +1,298 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_VECTORIZATION_HPP
#define KOKKOS_CUDA_VECTORIZATION_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <Kokkos_Cuda.hpp>
namespace Kokkos {
// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
// or other GPUs. We provide a generic definition (which is trivial
// and doesn't do what it claims to do) because we don't actually use
// this function unless we are on a suitable GPU, with a suitable
// Scalar type. (For example, in the mat-vec, the "ThreadsPerRow"
// internal parameter depends both on the ExecutionSpace and the Scalar type,
// and it controls whether shfl_down() gets called.)
namespace Impl {
template< typename Scalar >
struct shfl_union {
enum {n = sizeof(Scalar)/4};
float fval[n];
KOKKOS_INLINE_FUNCTION
Scalar value() {
return *(Scalar*) fval;
}
KOKKOS_INLINE_FUNCTION
void operator= (Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
KOKKOS_INLINE_FUNCTION
void operator= (const Scalar& value_) {
float* const val_ptr = (float*) &value_;
for(int i=0; i<n ; i++) {
fval[i] = val_ptr[i];
}
}
};
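// Example (sketch, hypothetical 12-byte type): the union views the
// value as n == sizeof(Scalar)/4 == 3 floats, each shuffled as a
// separate 4-byte quantity and reassembled by value():
//   struct Float3 { float x , y , z ; };
//   Impl::shfl_union<Float3> u ;
//   u = my_val ;                  // pack into u.fval[0..2]
//   /* shuffle each u.fval[i] */  // move lane to lane
//   Float3 r = u.value() ;        // unpack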
}
#ifdef __CUDA_ARCH__
#if (__CUDA_ARCH__ >= 300)
KOKKOS_INLINE_FUNCTION
int shfl(const int &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
KOKKOS_INLINE_FUNCTION
float shfl(const float &val, const int& srcLane, const int& width ) {
return __shfl(val,srcLane,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl(tmp,srcLane,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl(const double &val, const int& srcLane, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl(lo,srcLane,width);
hi = __shfl(hi,srcLane,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
return r_val.value();
}
KOKKOS_INLINE_FUNCTION
int shfl_down(const int &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_down(const float &val, const int& delta, const int& width) {
return __shfl_down(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_down(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl_down(const double &val, const int& delta, const int& width) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_down(lo,delta,width);
hi = __shfl_down(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
return r_val.value();
}
KOKKOS_INLINE_FUNCTION
int shfl_up(const int &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
KOKKOS_INLINE_FUNCTION
float shfl_up(const float &val, const int& delta, const int& width ) {
return __shfl_up(val,delta,width);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
Scalar tmp1 = val;
float tmp = *reinterpret_cast<float*>(&tmp1);
tmp = __shfl_up(tmp,delta,width);
return *reinterpret_cast<Scalar*>(&tmp);
}
KOKKOS_INLINE_FUNCTION
double shfl_up(const double &val, const int& delta, const int& width ) {
int lo = __double2loint(val);
int hi = __double2hiint(val);
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
return __hiloint2double(hi,lo);
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
int lo = __double2loint(*reinterpret_cast<const double*>(&val));
int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
lo = __shfl_up(lo,delta,width);
hi = __shfl_up(hi,delta,width);
const double tmp = __hiloint2double(hi,lo);
return *(reinterpret_cast<const Scalar*>(&tmp));
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
Impl::shfl_union<Scalar> s_val;
Impl::shfl_union<Scalar> r_val;
s_val = val;
for(int i = 0; i<s_val.n; i++)
r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
return r_val.value();
}
#else
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
return val;
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
return val;
}
template<typename Scalar>
KOKKOS_INLINE_FUNCTION
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
return val;
}
#endif
#else
template<typename Scalar>
inline
Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl from host code.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_down from host code.");
return val;
}
template<typename Scalar>
inline
Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
if(width > 1) Kokkos::abort("Error: calling shfl_up from host code.");
return val;
}
#endif
}
#endif // KOKKOS_HAVE_CUDA
#endif


@ -0,0 +1,312 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_VIEW_HPP
#define KOKKOS_CUDA_VIEW_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <cstring>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_View.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
struct AssertShapeBoundsAbort< CudaSpace >
{
KOKKOS_INLINE_FUNCTION
static void apply( const size_t /* rank */ ,
const size_t /* n0 */ , const size_t /* n1 */ ,
const size_t /* n2 */ , const size_t /* n3 */ ,
const size_t /* n4 */ , const size_t /* n5 */ ,
const size_t /* n6 */ , const size_t /* n7 */ ,
const size_t /* arg_rank */ ,
const size_t /* i0 */ , const size_t /* i1 */ ,
const size_t /* i2 */ , const size_t /* i3 */ ,
const size_t /* i4 */ , const size_t /* i5 */ ,
const size_t /* i6 */ , const size_t /* i7 */ )
{
Kokkos::abort("Kokkos::View array bounds violation");
}
};
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4).
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or to the __ldg intrinsic on Kepler GPUs or newer (compute capability >= 3.0).
template< typename ValueType
, class MemorySpace
, class AliasType =
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 , void
>::type
>::type
>::type
>
class CudaTextureFetch {
private:
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr = arg_ptr ;
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, void >
{
private:
const ValueType * m_ptr ;
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_ptr(0) {};
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
m_ptr = rhs.m_ptr;
return *this ;
}
explicit KOKKOS_INLINE_FUNCTION
CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
m_ptr = base_view_ptr;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
m_ptr = base_view_ptr;
return *this;
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
return m_ptr[ i ];
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
template< class ViewTraits >
class ViewDataHandle< ViewTraits ,
typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
&&
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
&&
ViewTraits::memory_traits::RandomAccess
>::type >
{
public:
enum { ReturnTypeIsReference = false };
typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
, typename ViewTraits::memory_space> handle_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
{
return handle_type(arg_data_ptr, arg_tracker);
}
typedef typename ViewTraits::value_type return_type;
};
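// Example (sketch): a view type that selects this specialization and
// therefore reads through texture fetches:
//   Kokkos::View< const double * , Kokkos::CudaSpace ,
//                 Kokkos::MemoryTraits< Kokkos::RandomAccess > >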
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */


@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_ABORT_HPP
#define KOKKOS_CUDA_ABORT_HPP
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include "Kokkos_Macros.hpp"
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
#include <cuda.h>
#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
#error "Cuda version 4.1 or greater required"
#endif
#if ( __CUDA_ARCH__ < 200 )
#error "Cuda device capability 2.0 or greater required"
#endif
extern "C" {
/* Cuda runtime function, declared in <crt/device_runtime.h>
* Requires capability 2.x or better.
*/
extern __device__ void __assertfail(
const void *message,
const void *file,
unsigned int line,
const void *function,
size_t charsize);
}
namespace Kokkos {
namespace Impl {
__device__ inline
void cuda_abort( const char * const message )
{
#ifndef __APPLE__
const char empty[] = "" ;
__assertfail( (const void *) message ,
(const void *) empty ,
(unsigned int) 0 ,
(const void *) empty ,
sizeof(char) );
#endif
}
} // namespace Impl
} // namespace Kokkos
#else
namespace Kokkos {
namespace Impl {
KOKKOS_INLINE_FUNCTION
void cuda_abort( const char * const ) {}
}
}
#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
namespace Kokkos {
__device__ inline
void abort( const char * const message ) { Kokkos::Impl::cuda_abort(message); }
}
#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */

File diff suppressed because it is too large


@ -0,0 +1,285 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Atomic.hpp
/// \brief Atomic functions
///
/// This header file defines prototypes for the following atomic functions:
/// - exchange
/// - compare and exchange
/// - add
///
/// Supported types include:
/// - signed and unsigned 4 and 8 byte integers
/// - float
/// - double
///
/// They are implemented through GCC compatible intrinsics, OpenMP
/// directives and native CUDA intrinsics.
///
/// Including this header file requires one of the following
/// compilers:
/// - NVCC (for CUDA device code only)
/// - GCC (for host code only)
/// - Intel (for host code only)
/// - A compiler that supports OpenMP 3.1 (for host code only)
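///
/// A minimal usage sketch (hedged: the View names, extent, and the
/// KOKKOS_LAMBDA-based dispatch below are illustrative assumptions, not
/// part of this header):
///
/// \code
/// // Count even entries; many threads may update the same counter,
/// // so the update must be atomic.
/// Kokkos::View<int*> values( "values" , n );
/// Kokkos::View<int>  count( "count" );
/// Kokkos::parallel_for( n , KOKKOS_LAMBDA( const int i ) {
///   if ( 0 == values(i) % 2 ) Kokkos::atomic_add( & count() , 1 );
/// });
/// \endcode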
#ifndef KOKKOS_ATOMIC_HPP
#define KOKKOS_ATOMIC_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_Traits.hpp>
//----------------------------------------------------------------------------
#if defined(_WIN32)
#define KOKKOS_ATOMICS_USE_WINDOWS
#else
#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
// Compiling NVIDIA device code, must use Cuda atomics:
#define KOKKOS_ATOMICS_USE_CUDA
#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
! defined( KOKKOS_ATOMICS_USE_OMP31 )
// An atomic implementation for non-Cuda compilation has not been
// pre-selected. Choose the best implementation for the detected compiler.
// Preference: GCC, INTEL, OMP31
#if defined( KOKKOS_COMPILER_GNU ) || \
defined( KOKKOS_COMPILER_CLANG ) || \
( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) )
#define KOKKOS_ATOMICS_USE_GCC
#elif defined( KOKKOS_COMPILER_INTEL ) || \
defined( KOKKOS_COMPILER_CRAYC )
#define KOKKOS_ATOMICS_USE_INTEL
#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
#define KOKKOS_ATOMICS_USE_OMP31
#else
#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
#endif
#endif /* Not pre-selected atomic implementation */
#endif
//----------------------------------------------------------------------------
// Forward declaration of functions supporting arbitrary-sized atomics.
// This is necessary since Kokkos_Atomic.hpp is internally included very early
// through Kokkos_HostSpace.hpp as well as the allocation tracker.
#ifdef KOKKOS_HAVE_CUDA
namespace Kokkos {
namespace Impl {
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
__device__ inline
bool lock_address_cuda_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
__device__ inline
void unlock_address_cuda_space(void* ptr);
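/// A hedged sketch of the lock protocol for an arbitrary-type atomic
/// update (device code only). The try-until-done shape below mirrors how
/// such atomics are commonly built and avoids warp-divergence deadlock,
/// but the function itself is an illustrative assumption:
///
/// \code
/// template< typename T > __device__
/// void atomic_apply( volatile T * const dest , const T val )
/// {
///   bool done = false ;
///   while ( ! done ) {
///     if ( Kokkos::Impl::lock_address_cuda_space( (void*) dest ) ) {
///       *dest = *dest + val ;                            // critical section
///       Kokkos::Impl::unlock_address_cuda_space( (void*) dest );
///       done = true ;
///     }
///   }
/// }
/// \endcode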
}
}
#endif
namespace Kokkos {
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_add(volatile T * const dest, const T src);
// Atomic increment
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_increment(volatile T* a);
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_decrement(volatile T* a);
}
#if ! defined(_WIN32)
#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
#endif
namespace Kokkos {
inline
const char * atomic_query_version()
{
#if defined( KOKKOS_ATOMICS_USE_CUDA )
return "KOKKOS_ATOMICS_USE_CUDA" ;
#elif defined( KOKKOS_ATOMICS_USE_GCC )
return "KOKKOS_ATOMICS_USE_GCC" ;
#elif defined( KOKKOS_ATOMICS_USE_INTEL )
return "KOKKOS_ATOMICS_USE_INTEL" ;
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
return "KOKKOS_ATOMICS_USE_OMP31" ;
#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
return "KOKKOS_ATOMICS_USE_WINDOWS";
#endif
}
} // namespace Kokkos
#ifdef _WIN32
#include "impl/Kokkos_Atomic_Windows.hpp"
#else
//#include "impl/Kokkos_Atomic_Assembly_X86.hpp"
//----------------------------------------------------------------------------
// Atomic exchange
//
// template< typename T >
// T atomic_exchange( volatile T* const dest , const T val )
// { T tmp = *dest ; *dest = val ; return tmp ; }
#include "impl/Kokkos_Atomic_Exchange.hpp"
//----------------------------------------------------------------------------
// Atomic compare-and-exchange
//
// template<class T>
// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and add
//
// template<class T>
// T atomic_fetch_add(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest += val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and sub
//
// template<class T>
// T atomic_fetch_sub(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest -= val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and or
//
// template<class T>
// T atomic_fetch_or(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
//----------------------------------------------------------------------------
// Atomic fetch and and
//
// template<class T>
// T atomic_fetch_and(volatile T* const dest, const T val)
// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
#include "impl/Kokkos_Atomic_Fetch_And.hpp"
#endif /*Not _WIN32*/
//----------------------------------------------------------------------------
// Memory fence
//
// All loads and stores from this thread will be globally consistent before continuing
//
// void memory_fence() {...};
#include "impl/Kokkos_Memory_Fence.hpp"
//----------------------------------------------------------------------------
// Provide volatile_load and safe_load
//
// T volatile_load(T const volatile * const ptr);
//
// T const& safe_load(T const * const ptr);
// XEON PHI:
// T safe_load(T const * const ptr);
#include "impl/Kokkos_Volatile_Load.hpp"
#ifndef _WIN32
#include "impl/Kokkos_Atomic_Generic.hpp"
#endif
//----------------------------------------------------------------------------
// This atomic-style macro should be an inlined function, not a macro
#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__)
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
#else
#define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
#define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
#endif
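//
// A hedged usage sketch for the prefetch macros above: when streaming
// once through a large array, prefetching ahead with a non-temporal hint
// keeps the walk from stalling without polluting the cache. The function
// and the look-ahead distance are illustrative assumptions.
//
//   double sum( const double * const x , const size_t n )
//   {
//     double s = 0 ;
//     for ( size_t i = 0 ; i < n ; ++i ) {
//       KOKKOS_NONTEMPORAL_PREFETCH_LOAD( x + i + 64 ); // ~64 elements ahead
//       s += x[i] ;
//     }
//     return s ;
//   }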
//----------------------------------------------------------------------------
#endif /* KOKKOS_ATOMIC_HPP */


@ -0,0 +1,228 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_HPP
#define KOKKOS_CORE_HPP
//----------------------------------------------------------------------------
// Include the execution space header files for the enabled execution spaces.
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_CUDA )
#include <Kokkos_Cuda.hpp>
#endif
#if defined( KOKKOS_HAVE_OPENMP )
#include <Kokkos_OpenMP.hpp>
#endif
#if defined( KOKKOS_HAVE_SERIAL )
#include <Kokkos_Serial.hpp>
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
#include <Kokkos_Threads.hpp>
#endif
#include <Kokkos_Pair.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Vectorization.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_hwloc.hpp>
#include <iostream>
//----------------------------------------------------------------------------
namespace Kokkos {
struct InitArguments {
int num_threads;
int num_numa;
int device_id;
InitArguments() {
num_threads = -1;
num_numa = -1;
device_id = -1;
}
};
void initialize(int& narg, char* arg[]);
void initialize(const InitArguments& args = InitArguments());
/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();
/** \brief Finalize all known execution spaces */
void finalize_all();
void fence();
}
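// A minimal lifecycle sketch for the declarations above (hedged: the
// thread count is an illustrative assumption). Options may be passed
// either via the (narg,arg) overload or via InitArguments, whose -1
// defaults mean "unset":
//
//   int main( int narg , char * arg[] )
//   {
//     Kokkos::InitArguments args ;
//     args.num_threads = 4 ;
//     Kokkos::initialize( args );        // or: Kokkos::initialize( narg , arg );
//     // ... allocate Views and dispatch kernels here ...
//     Kokkos::fence();                   // wait for outstanding kernels
//     Kokkos::finalize();
//     return 0 ;
//   }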
#ifdef KOKKOS_HAVE_CXX11
namespace Kokkos {
namespace Impl {
// should only be used by kokkos_malloc and kokkos_free
struct MallocHelper
{
static void increment_ref_count( AllocationTracker const & tracker )
{
tracker.increment_ref_count();
}
static void decrement_ref_count( AllocationTracker const & tracker )
{
tracker.decrement_ref_count();
}
};
} // namespace Impl
/* Allocate memory from a memory space.
 * The allocation is tracked in the Kokkos memory tracking system, so
* leaked memory can be identified.
*/
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const std::string label, size_t count) {
typedef typename Arg::memory_space MemorySpace;
  Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);
Impl::MallocHelper::increment_ref_count( tracker );
return tracker.alloc_ptr();
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const size_t& count) {
return kokkos_malloc<Arg>("DefaultLabel",count);
}
/* Free memory from a memory space.
*/
template< class Arg = DefaultExecutionSpace>
void kokkos_free(const void* ptr) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
if (tracker.is_valid()) {
Impl::MallocHelper::decrement_ref_count( tracker );
}
}
template< class Arg = DefaultExecutionSpace>
const void* kokkos_realloc(const void* old_ptr, size_t size) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
tracker.reallocate(size);
return tracker.alloc_ptr();
}
} // namespace Kokkos
#endif
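// A hedged usage sketch for the tracked allocation interface above
// (KOKKOS_HAVE_CXX11 builds only; the label and size are illustrative
// assumptions):
//
//   double * p = static_cast<double*>(
//     Kokkos::kokkos_malloc( "my_buffer" , 100 * sizeof(double) ) );
//   // ... use p in the default execution space's memory space ...
//   Kokkos::kokkos_free( p );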
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_malloc( const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void kokkos_free( void * arg_alloc )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r = RecordHost::get_record( arg_alloc );
RecordBase::decrement( r );
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r_old = RecordHost::get_record( arg_alloc );
RecordHost * const r_new = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
Kokkos::Impl::DeepCopy<MemorySpace,MemorySpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
} // namespace Experimental
} // namespace Kokkos
#endif


@ -0,0 +1,170 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_FWD_HPP
#define KOKKOS_CORE_FWD_HPP
//----------------------------------------------------------------------------
// Kokkos_Macros.hpp does introspection on configuration options
// and compiler environment then sets a collection of #define macros.
#include <Kokkos_Macros.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Forward declarations for class inter-relationships
namespace Kokkos {
class HostSpace ; ///< Memory space for main process and CPU execution spaces
#if defined( KOKKOS_HAVE_SERIAL )
class Serial ; ///< Execution space main process on CPU
#endif // defined( KOKKOS_HAVE_SERIAL )
#if defined( KOKKOS_HAVE_PTHREAD )
class Threads ; ///< Execution space with pthreads back-end
#endif
#if defined( KOKKOS_HAVE_OPENMP )
class OpenMP ; ///< OpenMP execution space
#endif
#if defined( KOKKOS_HAVE_CUDA )
class CudaSpace ; ///< Memory space on Cuda GPU
class CudaUVMSpace ; ///< Memory space on Cuda GPU with UVM
class CudaHostPinnedSpace ; ///< Memory space on Host accessible to Cuda GPU
class Cuda ; ///< Execution space for Cuda GPU
#endif
template<class ExecutionSpace, class MemorySpace>
struct Device;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Set the default execution space.
/// Define Kokkos::DefaultExecutionSpace as per configuration option
/// or chosen from the enabled execution spaces in the following order:
/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
namespace Kokkos {
#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Cuda DefaultExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultExecutionSpace ;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
#endif
#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_PTHREAD )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_HAVE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
#endif
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Detect the active execution space and define its memory space.
// This is used to verify whether a running kernel can access
// a given memory space.
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_HAVE_CUDA)
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace ;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
#else
typedef void ActiveExecutionMemorySpace ;
#endif
template< class ActiveSpace , class MemorySpace >
struct VerifyExecutionCanAccessMemorySpace {
enum {value = 0};
};
template< class Space >
struct VerifyExecutionCanAccessMemorySpace< Space , Space >
{
enum {value = 1};
KOKKOS_INLINE_FUNCTION static void verify(void) {}
KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
};
} // namespace Impl
} // namespace Kokkos
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
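// A hedged sketch of guarding a raw-pointer access with the macro above
// (the function and data space are illustrative assumptions):
//
//   KOKKOS_INLINE_FUNCTION
//   double load_host_value( const double * p )
//   {
//     // Fails at run time when the active execution space cannot
//     // address HostSpace memory (e.g. inside a Cuda kernel):
//     KOKKOS_RESTRICT_EXECUTION_TO_DATA( Kokkos::HostSpace , p );
//     return *p ;
//   }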
namespace Kokkos {
void fence();
}
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */


@ -0,0 +1,268 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_HPP
#define KOKKOS_CUDA_HPP
#include <Kokkos_Core_fwd.hpp>
// If CUDA execution space is enabled then use this header file.
#if defined( KOKKOS_HAVE_CUDA )
#include <iosfwd>
#include <vector>
#include <Kokkos_CudaSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class CudaExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/// \class Cuda
/// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
///
/// An "execution space" represents a parallel execution model. It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce. For example, the Threads execution space uses Pthreads or
/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language
/// extensions, and the Serial execution space executes "parallel" kernels
/// sequentially. The Cuda execution space uses NVIDIA's CUDA programming
/// model to execute kernels in parallel on GPUs.
class Cuda {
public:
//! \name Type declarations that all Kokkos execution spaces must provide.
//@{
//! Tag this class as a kokkos execution space
typedef Cuda execution_space ;
#if defined( KOKKOS_USE_CUDA_UVM )
//! This execution space's preferred memory space.
typedef CudaUVMSpace memory_space ;
#else
//! This execution space's preferred memory space.
typedef CudaSpace memory_space ;
#endif
  //! This execution space's preferred device_type.
typedef Kokkos::Device<execution_space,memory_space> device_type;
//! The size_type best suited for this execution space.
typedef memory_space::size_type size_type ;
//! This execution space's preferred array layout.
typedef LayoutLeft array_layout ;
  //! This execution space's preferred scratch memory space.
typedef ScratchMemorySpace< Cuda > scratch_memory_space ;
//@}
//--------------------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
KOKKOS_INLINE_FUNCTION static int in_parallel() {
#if defined( __CUDA_ARCH__ )
return true;
#else
return false;
#endif
}
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence();
//! Free any resources being consumed by the device.
static void finalize();
  //! Whether this execution space has been initialized.
static int is_initialized();
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
//@}
//--------------------------------------------------
//! \name Cuda space instances
~Cuda() {}
Cuda();
explicit Cuda( const int instance_id );
Cuda( const Cuda & ) = default ;
Cuda( Cuda && ) = default ;
Cuda & operator = ( const Cuda & ) = default ;
Cuda & operator = ( Cuda && ) = default ;
//--------------------------------------------------------------------------
//! \name Device-specific functions
//@{
struct SelectDevice {
int cuda_device_id ;
SelectDevice() : cuda_device_id(0) {}
explicit SelectDevice( int id ) : cuda_device_id( id ) {}
};
//! Initialize, telling the CUDA run-time library which device to use.
static void initialize( const SelectDevice = SelectDevice()
, const size_t num_instances = 1 );
/// \brief Cuda device architecture of the selected device.
///
/// This matches the __CUDA_ARCH__ specification.
static size_type device_arch();
//! Query device count.
static size_type detect_device_count();
/** \brief Detect the available devices and their architecture
* as defined by the __CUDA_ARCH__ specification.
*/
static std::vector<unsigned> detect_device_arch();
cudaStream_t cuda_stream() const { return m_stream ; }
int cuda_device() const { return m_device ; }
//@}
//--------------------------------------------------------------------------
private:
cudaStream_t m_stream ;
int m_device ;
};
} // namespace Kokkos
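// A hedged initialization sketch for the Cuda execution space (the
// device id is an illustrative assumption; Kokkos::initialize performs
// the equivalent steps when Cuda is the default execution space):
//
//   Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
//   // ... dispatch kernels to the Cuda execution space ...
//   Kokkos::Cuda::fence();      // wait for asynchronous kernels
//   Kokkos::Cuda::finalize();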
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::CudaSpace
, Kokkos::Cuda::scratch_memory_space
>
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::HostSpace
, Kokkos::Cuda::scratch_memory_space
>
{
enum { value = false };
inline static void verify( void ) { CudaSpace::access_error(); }
inline static void verify( const void * p ) { CudaSpace::access_error(p); }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_View.hpp>
#include <KokkosExp_View.hpp>
#include <Cuda/KokkosExp_Cuda_View.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_HPP */


@ -0,0 +1,656 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDASPACE_HPP
#define KOKKOS_CUDASPACE_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_CUDA )
#include <iosfwd>
#include <typeinfo>
#include <string>
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Cuda on-device memory management */
class CudaSpace {
public:
//! Tag this class as a kokkos memory space
typedef CudaSpace memory_space ;
typedef Kokkos::Cuda execution_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
typedef Impl::CudaMallocAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
  /** \brief Cuda-specific function to attach a texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
/*--------------------------------*/
CudaSpace();
CudaSpace( const CudaSpace & rhs ) = default ;
CudaSpace & operator = ( const CudaSpace & rhs ) = default ;
~CudaSpace() = default ;
/**\brief Allocate memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/*--------------------------------*/
/** \brief Error reporting for HostSpace attempt to access CudaSpace */
static void access_error();
static void access_error( const void * const );
private:
int m_device ; ///< Which Cuda device
// friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
};
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_array_cuda_space();
/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function retrieves the lock array pointer,
/// allocating the array first if it has not yet been allocated.
int* lock_array_cuda_space_ptr(bool deallocate = false);
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Cuda memory that is accessible to Host execution space
* through Cuda's unified virtual memory (UVM) runtime.
*/
class CudaUVMSpace {
public:
//! Tag this class as a kokkos memory space
typedef CudaUVMSpace memory_space ;
typedef Cuda execution_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
/** \brief If UVM capability is available */
static bool available();
typedef Impl::CudaUVMAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
  /** \brief Cuda-specific function to attach a texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
/*--------------------------------*/
CudaUVMSpace();
CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ;
~CudaUVMSpace() = default ;
/**\brief Allocate memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/*--------------------------------*/
private:
int m_device ; ///< Which Cuda device
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Host memory that is accessible to Cuda execution space
* through Cuda's host-pinned memory allocation.
*/
class CudaHostPinnedSpace {
public:
//! Tag this class as a kokkos memory space
/** \brief Memory is in HostSpace so use the HostSpace::execution_space */
typedef HostSpace::execution_space execution_space ;
typedef CudaHostPinnedSpace memory_space ;
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
typedef Impl::CudaHostAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
CudaHostPinnedSpace();
CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;
CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ;
~CudaHostPinnedSpace() = default ;
/**\brief Allocate memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
/*--------------------------------*/
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<> struct DeepCopy< CudaSpace , CudaSpace >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Cuda & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< CudaSpace , HostSpace >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Cuda & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< HostSpace , CudaSpace >
{
DeepCopy( void * dst , const void * src , size_t );
DeepCopy( const Cuda & , void * dst , const void * src , size_t );
};
template<> struct DeepCopy< CudaSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , CudaSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaUVMSpace , HostSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , CudaSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< CudaHostPinnedSpace , HostSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
};
template<> struct DeepCopy< HostSpace , CudaUVMSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
};
template<> struct DeepCopy< HostSpace , CudaHostPinnedSpace >
{
inline
DeepCopy( void * dst , const void * src , size_t n )
{ (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
};
} // namespace Impl
} // namespace Kokkos
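// A hedged sketch of what the Impl::DeepCopy specializations above are
// used for (the pointers and byte count are illustrative; user code
// normally goes through Kokkos::deep_copy on Views instead):
//
//   // Host -> device copy of n bytes:
//   Kokkos::Impl::DeepCopy< Kokkos::CudaSpace , Kokkos::HostSpace >( dev_ptr , host_ptr , n );
//
//   // The same copy issued on a specific Cuda instance:
//   Kokkos::Impl::DeepCopy< Kokkos::CudaSpace , Kokkos::HostSpace >( cuda , dev_ptr , host_ptr , n );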
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** Running in CudaSpace attempting to access HostSpace: error */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
};
/** Running in CudaSpace accessing CudaUVMSpace: ok */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) { }
KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
};
/** Running in CudaSpace attempting to access an unknown space: error */
template< class OtherSpace >
struct VerifyExecutionCanAccessMemorySpace<
typename enable_if< ! is_same<Kokkos::CudaSpace,OtherSpace>::value , Kokkos::CudaSpace >::type ,
OtherSpace >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static void verify( void )
{ Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
KOKKOS_INLINE_FUNCTION static void verify( const void * )
{ Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
};
//----------------------------------------------------------------------------
/** Running in HostSpace attempting to access CudaSpace: error */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace >
{
enum { value = false };
inline static void verify( void ) { CudaSpace::access_error(); }
inline static void verify( const void * p ) { CudaSpace::access_error(p); }
};
/** Running in HostSpace accessing CudaUVMSpace is OK */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaUVMSpace >
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
/** Running in HostSpace accessing CudaHostPinnedSpace is OK */
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >
{
enum { value = true };
KOKKOS_INLINE_FUNCTION static void verify( void ) {}
KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::CudaSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
friend class SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static ::cudaTextureObject_t
attach_texture_object( const unsigned sizeof_alias
, void * const alloc_ptr
, const size_t alloc_size );
static RecordBase s_root_record ;
::cudaTextureObject_t m_tex_obj ;
const Kokkos::CudaSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
template< typename AliasType >
inline
::cudaTextureObject_t attach_texture_object()
{
static_assert( ( std::is_same< AliasType , int >::value ||
std::is_same< AliasType , ::int2 >::value ||
std::is_same< AliasType , ::int4 >::value )
, "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
if ( m_tex_obj == 0 ) {
m_tex_obj = attach_texture_object( sizeof(AliasType)
, (void*) RecordBase::m_alloc_ptr
, RecordBase::m_alloc_size );
}
return m_tex_obj ;
}
template< typename AliasType >
inline
int attach_texture_object_offset( const AliasType * const ptr )
{
// Texture object is attached to the entire allocation range
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false );
};
template<>
class SharedAllocationRecord< Kokkos::CudaUVMSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
::cudaTextureObject_t m_tex_obj ;
const Kokkos::CudaUVMSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
template< typename AliasType >
inline
::cudaTextureObject_t attach_texture_object()
{
static_assert( ( std::is_same< AliasType , int >::value ||
std::is_same< AliasType , ::int2 >::value ||
std::is_same< AliasType , ::int4 >::value )
, "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
if ( m_tex_obj == 0 ) {
m_tex_obj = SharedAllocationRecord< Kokkos::CudaSpace , void >::
attach_texture_object( sizeof(AliasType)
, (void*) RecordBase::m_alloc_ptr
, RecordBase::m_alloc_size );
}
return m_tex_obj ;
}
template< typename AliasType >
inline
int attach_texture_object_offset( const AliasType * const ptr )
{
// Texture object is attached to the entire allocation range
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false );
};
template<>
class SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
static RecordBase s_root_record ;
const Kokkos::CudaHostPinnedSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() : RecordBase(), m_space() {}
SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
std::string get_label() const ;
static SharedAllocationRecord * allocate( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaHostPinnedSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
#endif /* #define KOKKOS_CUDASPACE_HPP */


@ -0,0 +1,497 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXECPOLICY_HPP
#define KOKKOS_EXECPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_Tags.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Execution policy for work over a range of an integral type.
*
* Valid template argument options:
*
* With a specified execution space:
* < ExecSpace , WorkTag , { IntConst | IntType } >
* < ExecSpace , WorkTag , void >
* < ExecSpace , { IntConst | IntType } , void >
* < ExecSpace , void , void >
*
* With the default execution space:
* < WorkTag , { IntConst | IntType } , void >
* < WorkTag , void , void >
* < { IntConst | IntType } , void , void >
* < void , void , void >
*
* IntType is a fundamental integral type
* IntConst is an Impl::integral_constant< IntType , Blocking >
*
* Blocking is the granularity of partitioning the range among threads.
*/
template< class Arg0 = void , class Arg1 = void , class Arg2 = void
, class ExecSpace =
          // If the first argument is an execution space use it,
          // otherwise use the default execution space.
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
>
class RangePolicy {
private:
// Default integral type and blocking factor:
typedef int DefaultIntType ;
enum { DefaultIntValue = 8 };
enum { Arg0_Void = Impl::is_same< Arg0 , void >::value };
enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
enum { Arg2_Void = Impl::is_same< Arg2 , void >::value };
enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
enum { Arg0_IntConst = Impl::is_integral_constant< Arg0 >::value };
enum { Arg1_IntConst = Impl::is_integral_constant< Arg1 >::value };
enum { Arg2_IntConst = Impl::is_integral_constant< Arg2 >::value };
enum { Arg0_IntType = Impl::is_integral< Arg0 >::value };
enum { Arg1_IntType = Impl::is_integral< Arg1 >::value };
enum { Arg2_IntType = Impl::is_integral< Arg2 >::value };
enum { Arg0_WorkTag = ! Arg0_ExecSpace && ! Arg0_IntConst && ! Arg0_IntType && ! Arg0_Void };
enum { Arg1_WorkTag = Arg0_ExecSpace && ! Arg1_IntConst && ! Arg1_IntType && ! Arg1_Void };
enum { ArgOption_OK = Impl::StaticAssert< (
( Arg0_ExecSpace && Arg1_WorkTag && ( Arg2_IntConst || Arg2_IntType ) ) ||
( Arg0_ExecSpace && Arg1_WorkTag && Arg2_Void ) ||
( Arg0_ExecSpace && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
( Arg0_ExecSpace && Arg1_Void && Arg2_Void ) ||
( Arg0_WorkTag && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
( Arg0_WorkTag && Arg1_Void && Arg2_Void ) ||
( ( Arg0_IntConst || Arg0_IntType ) && Arg1_Void && Arg2_Void ) ||
( Arg0_Void && Arg1_Void && Arg2_Void )
) >::value };
// The work argument tag is the first or second argument
typedef typename Impl::if_c< Arg0_WorkTag , Arg0 ,
typename Impl::if_c< Arg1_WorkTag , Arg1 , void
>::type >::type
WorkTag ;
enum { Granularity = Arg0_IntConst ? unsigned(Impl::is_integral_constant<Arg0>::integral_value) : (
Arg1_IntConst ? unsigned(Impl::is_integral_constant<Arg1>::integral_value) : (
Arg2_IntConst ? unsigned(Impl::is_integral_constant<Arg2>::integral_value) : (
unsigned(DefaultIntValue) ))) };
// Only accept the integral type if the blocking is a power of two
typedef typename Impl::enable_if< Impl::is_power_of_two< Granularity >::value ,
typename Impl::if_c< Arg0_IntType , Arg0 ,
typename Impl::if_c< Arg1_IntType , Arg1 ,
typename Impl::if_c< Arg2_IntType , Arg2 ,
typename Impl::if_c< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
typename Impl::if_c< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
typename Impl::if_c< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
DefaultIntType
>::type >::type >::type
>::type >::type >::type
>::type
IntType ;
enum { GranularityMask = IntType(Granularity) - 1 };
ExecSpace m_space ;
IntType m_begin ;
IntType m_end ;
public:
//! Tag this class as an execution policy
typedef ExecSpace execution_space ;
typedef RangePolicy execution_policy ;
typedef WorkTag work_tag ;
typedef IntType member_type ;
KOKKOS_INLINE_FUNCTION const execution_space & space() const { return m_space ; }
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
inline RangePolicy() : m_space(), m_begin(0), m_end(0) {}
/** \brief Total range */
inline
RangePolicy( const member_type work_begin
, const member_type work_end
)
: m_space()
, m_begin( work_begin < work_end ? work_begin : 0 )
, m_end( work_begin < work_end ? work_end : 0 )
{}
/** \brief Total range */
inline
RangePolicy( const execution_space & work_space
, const member_type work_begin
, const member_type work_end
)
: m_space( work_space )
, m_begin( work_begin < work_end ? work_begin : 0 )
, m_end( work_begin < work_end ? work_end : 0 )
{}
/** \brief Subrange for a partition's rank and size.
*
* Typically used to partition a range over a group of threads.
*/
struct WorkRange {
typedef RangePolicy::work_tag work_tag ;
typedef RangePolicy::member_type member_type ;
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
/** \brief Subrange for a partition's rank and size.
*
* Typically used to partition a range over a group of threads.
*/
KOKKOS_INLINE_FUNCTION
WorkRange( const RangePolicy & range
, const int part_rank
, const int part_size
)
: m_begin(0), m_end(0)
{
if ( part_size ) {
// Split evenly among partitions, then round up to the granularity.
const member_type work_part =
( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
+ GranularityMask ) & ~member_type(GranularityMask);
m_begin = range.begin() + work_part * part_rank ;
m_end = m_begin + work_part ;
if ( range.end() < m_begin ) m_begin = range.end() ;
if ( range.end() < m_end ) m_end = range.end() ;
}
}
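      // Worked example (illustrative): for range [0,100), part_size = 4,
      // and Granularity = 8 : work_part = ((100+3)/4 + 7) & ~7 = 32 ,
      // giving partitions [0,32) [32,64) [64,96) [96,100) , where the
      // last is clamped to range.end().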
private:
member_type m_begin ;
member_type m_end ;
WorkRange();
WorkRange & operator = ( const WorkRange & );
};
};
} // namespace Kokkos
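// A hedged dispatch sketch using RangePolicy (the lambda body and the
// names x, y, a, n are illustrative assumptions):
//
//   // Iterate i = 0 .. n-1 in the default execution space:
//   Kokkos::parallel_for( Kokkos::RangePolicy<>( 0 , n ) ,
//     KOKKOS_LAMBDA( const int i ) { y(i) = a * x(i) + y(i); } );
//
//   // The same iteration pinned to a specific execution space:
//   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , n ) , functor );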
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Execution policy for parallel work over a league of teams of threads.
*
* The work functor is called for each thread of each team such that
* the team's member threads are guaranteed to be concurrent.
*
* The team's threads have access to team shared scratch memory and
* team collective operations.
*
* If the WorkTag is non-void then the first calling argument of the
* work functor's parentheses operator is 'const WorkTag &'.
* This allows a functor to have multiple work member functions.
*
* template argument option with specified execution space:
* < ExecSpace , WorkTag >
* < ExecSpace , void >
*
* template argument option with default execution space:
* < WorkTag , void >
* < void , void >
*/
template< class Arg0 = void
, class Arg1 = void
, class ExecSpace =
// If the first argument is not an execution space
// then use the default execution space.
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
>
class TeamPolicy {
private:
enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
enum { ArgOption_OK = Impl::StaticAssert< ( Arg0_ExecSpace || Arg1_Void ) >::value };
typedef typename Impl::if_c< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
public:
//! Tag this class as an execution policy
typedef TeamPolicy execution_policy ;
typedef ExecSpace execution_space ;
typedef WorkTag work_tag ;
//----------------------------------------
/** \brief Query maximum team size for a given functor.
*
* This size takes into account execution space concurrency limitations and
* scratch memory space limitations for reductions, team reduce/scan, and
* team shared memory.
*/
template< class FunctorType >
static int team_size_max( const FunctorType & );
/** \brief Query recommended team size for a given functor.
*
* This size takes into account execution space concurrency limitations and
* scratch memory space limitations for reductions, team reduce/scan, and
* team shared memory.
*/
template< class FunctorType >
static int team_size_recommended( const FunctorType & );
template< class FunctorType >
static int team_size_recommended( const FunctorType & , const int&);
//----------------------------------------
/** \brief Construct policy with the given instance of the execution space */
TeamPolicy( const execution_space & , int league_size_request , int team_size_request );
/** \brief Construct policy with the default instance of the execution space */
TeamPolicy( int league_size_request , int team_size_request );
/** \brief The actual league size (number of teams) of the policy.
*
* This may be smaller than the requested league size due to limitations
* of the execution space.
*/
KOKKOS_INLINE_FUNCTION int league_size() const ;
/** \brief The actual team size (number of threads per team) of the policy.
*
* This may be smaller than the requested team size due to limitations
* of the execution space.
*/
KOKKOS_INLINE_FUNCTION int team_size() const ;
/** \brief Parallel execution of a functor calls the functor once with
* each member of the execution policy.
*/
struct member_type {
/** \brief Handle to the currently executing team shared scratch memory */
KOKKOS_INLINE_FUNCTION
typename execution_space::scratch_memory_space team_shmem() const ;
/** \brief Rank of this team within the league of teams */
KOKKOS_INLINE_FUNCTION int league_rank() const ;
/** \brief Number of teams in the league */
KOKKOS_INLINE_FUNCTION int league_size() const ;
/** \brief Rank of this thread within this team */
KOKKOS_INLINE_FUNCTION int team_rank() const ;
/** \brief Number of threads in this team */
KOKKOS_INLINE_FUNCTION int team_size() const ;
/** \brief Barrier among the threads of this team */
KOKKOS_INLINE_FUNCTION void team_barrier() const ;
/** \brief Intra-team reduction. Returns join of all values of the team members. */
template< class JoinOp >
KOKKOS_INLINE_FUNCTION
typename JoinOp::value_type team_reduce( const typename JoinOp::value_type
, const JoinOp & ) const ;
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const ;
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
};
};
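/// A usage sketch added for illustration (the league/team sizes and the
/// lambda body are hypothetical): each team member computes its ranks.
/// \code
///   typedef Kokkos::TeamPolicy< Kokkos::DefaultExecutionSpace > policy_type ;
///   typedef policy_type::member_type member_type ;
///   Kokkos::parallel_for( policy_type( 8 /*league*/ , 4 /*team*/ ) ,
///     KOKKOS_LAMBDA( const member_type & member ) {
///       const int i = member.league_rank() * member.team_size()
///                   + member.team_rank();
///       member.team_barrier();
///       (void) i ;
///     } );
/// \endcode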
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<typename iType, class TeamMemberType>
struct TeamThreadRangeBoundariesStruct {
private:
KOKKOS_INLINE_FUNCTION static
iType ibegin( const iType & arg_begin
, const iType & arg_end
, const iType & arg_rank
, const iType & arg_size
)
{
return arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * arg_rank ;
}
KOKKOS_INLINE_FUNCTION static
iType iend( const iType & arg_begin
, const iType & arg_end
, const iType & arg_rank
, const iType & arg_size
)
{
const iType end_ = arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * ( arg_rank + 1 );
return end_ < arg_end ? end_ : arg_end ;
}
public:
typedef iType index_type;
const iType start;
const iType end;
enum {increment = 1};
const TeamMemberType& thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
, const iType& arg_end
)
: start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, end( iend( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, thread( arg_thread )
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
, const iType& arg_begin
, const iType& arg_end
)
: start( ibegin( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, end( iend( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
, thread( arg_thread )
{}
};
template<typename iType, class TeamMemberType>
struct ThreadVectorRangeBoundariesStruct {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const TeamMemberType& thread, const iType& count):
end( count )
{}
};
template<class TeamMemberType>
struct ThreadSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
ThreadSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
};
template<class TeamMemberType>
struct VectorSingleStruct {
const TeamMemberType& team_member;
KOKKOS_INLINE_FUNCTION
VectorSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
};
} // namespace Impl
/** \brief Execution policy for parallel work over the threads within a team.
 *
 * The range is split over all threads in a team. The mapping scheme depends on the architecture.
 * This policy is used together with a parallel pattern as a nested layer within a kernel launched
 * with the TeamPolicy. This variant expects a single count, so the range is [0,count).
 */
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& count);
/** \brief Execution policy for parallel work over the threads within a team.
 *
 * The range is split over all threads in a team. The mapping scheme depends on the architecture.
 * This policy is used together with a parallel pattern as a nested layer within a kernel launched
 * with the TeamPolicy. This variant expects a begin and an end, so the range is [begin,end).
 */
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end);
/** \brief Execution policy for a vector parallel loop.
*
 * The range is split over all vector lanes in a thread. The mapping scheme depends on the architecture.
 * This policy is used together with a parallel pattern as a nested layer within a kernel launched
 * with the TeamPolicy. This variant expects a single count, so the range is [0,count).
*/
template<typename iType, class TeamMemberType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(const TeamMemberType&, const iType& count);
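/// A nested-parallelism sketch added for illustration (the extents m and n
/// and the lambda bodies are hypothetical): TeamThreadRange distributes
/// rows over the threads of a team, and ThreadVectorRange distributes
/// columns over the vector lanes of each thread.
/// \code
///   KOKKOS_LAMBDA( const member_type & member ) {
///     Kokkos::parallel_for( Kokkos::TeamThreadRange( member , m ) ,
///       [&]( const int i ) {
///         Kokkos::parallel_for( Kokkos::ThreadVectorRange( member , n ) ,
///           [&]( const int j ) { /* body over (i,j) */ } );
///       } );
///   }
/// \endcode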
} // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,270 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HOSTSPACE_HPP
#define KOKKOS_HOSTSPACE_HPP
#include <cstring>
#include <string>
#include <iosfwd>
#include <typeinfo>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/KokkosExp_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_array_host_space();
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
bool lock_address_host_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
void unlock_address_host_space(void* ptr);
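/// A host-side usage sketch added for illustration (the pointer 'ptr' and
/// the update are hypothetical): spin until the lock is acquired, perform
/// the read-modify-write, then release.
/// \code
///   while ( ! Kokkos::Impl::lock_address_host_space( ptr ) ) { /* retry */ }
///   // ... perform the arbitrary-size atomic update on *ptr ...
///   Kokkos::Impl::unlock_address_host_space( ptr );
/// \endcode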
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
/// \class HostSpace
/// \brief Memory management for host memory.
///
/// HostSpace is a memory space that governs host memory. "Host"
/// memory means the usual CPU-accessible memory.
class HostSpace {
public:
//! Tag this class as a kokkos memory space
typedef HostSpace memory_space ;
typedef size_t size_type ;
/// \typedef execution_space
/// \brief Default execution space for this memory space.
///
/// Every memory space has a default execution space. This is
/// useful for things like initializing a View (which happens in
/// parallel using the View's default execution space).
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Kokkos::Threads execution_space ;
#elif defined( KOKKOS_HAVE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
#elif defined( KOKKOS_HAVE_PTHREAD )
typedef Kokkos::Threads execution_space ;
#elif defined( KOKKOS_HAVE_SERIAL )
typedef Kokkos::Serial execution_space ;
#else
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
#endif
//! This memory space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
typedef Impl::PageAlignedAllocator allocator ;
#else
typedef Impl::AlignedAllocator allocator ;
#endif
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
/*--------------------------------*/
/**\brief Default memory space instance */
HostSpace();
HostSpace( const HostSpace & rhs ) = default ;
HostSpace & operator = ( const HostSpace & ) = default ;
~HostSpace() = default ;
/**\brief Non-default memory space instance to choose the allocation mechanism, if available */
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
explicit
HostSpace( const AllocationMechanism & );
/**\brief Allocate memory in the host space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the host space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
private:
AllocationMechanism m_alloc_mech ;
friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
};
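/// An allocation sketch added for illustration (the size and the
/// POSIX_MEMALIGN choice are hypothetical):
/// \code
///   Kokkos::HostSpace space( Kokkos::HostSpace::POSIX_MEMALIGN );
///   void * const ptr = space.allocate( 1024 );
///   // ... use the 1024-byte block ...
///   space.deallocate( ptr , 1024 );
/// \endcode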
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::HostSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
friend Kokkos::HostSpace ;
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
/**\brief Root record for tracked allocations from this HostSpace instance */
static RecordBase s_root_record ;
const Kokkos::HostSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() = default ;
SharedAllocationRecord( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
inline
std::string get_label() const
{
return std::string( RecordBase::head()->m_label );
}
KOKKOS_INLINE_FUNCTION static
SharedAllocationRecord * allocate( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
#else
return (SharedAllocationRecord *) 0 ;
#endif
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class , class > struct DeepCopy ;
template<>
struct DeepCopy<HostSpace,HostSpace> {
DeepCopy( void * dst , const void * src , size_t n );
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_HOSTSPACE_HPP */

View File

@ -0,0 +1,174 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Layout.hpp
/// \brief Declaration of various \c MemoryLayout options.
#ifndef KOKKOS_LAYOUT_HPP
#define KOKKOS_LAYOUT_HPP
#include <stddef.h>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
namespace Kokkos {
//----------------------------------------------------------------------------
/// \struct LayoutLeft
/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
/// striding of multi-indices.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Layout left" indicates a mapping where the leftmost index i0
/// refers to contiguous access, and strides increase for dimensions
/// going right from there (i1, i2, ...). This layout imitates how
/// Fortran stores multi-dimensional arrays. For the special case of
/// a two-dimensional array, "layout left" is also called "column
/// major."
struct LayoutLeft {
//! Tag this class as a kokkos array layout
typedef LayoutLeft array_layout ;
};
//----------------------------------------------------------------------------
/// \struct LayoutRight
/// \brief Memory layout tag indicating right-to-left (C or
/// lexicographical scheme) striding of multi-indices.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Right layout" indicates a mapping where the rightmost index ik
/// refers to contiguous access, and strides increase for dimensions
/// going left from there. This layout imitates how C stores
/// multi-dimensional arrays. For the special case of a
/// two-dimensional array, "layout right" is also called "row major."
struct LayoutRight {
//! Tag this class as a kokkos array layout
typedef LayoutRight array_layout ;
};
//----------------------------------------------------------------------------
/// \struct LayoutStride
/// \brief Memory layout tag indicating arbitrarily strided
/// multi-index mapping into contiguous memory.
struct LayoutStride {
//! Tag this class as a kokkos array layout
typedef LayoutStride array_layout ;
enum { MAX_RANK = 8 };
size_t dimension[ MAX_RANK ] ;
size_t stride[ MAX_RANK ] ;
/** \brief Compute strides from ordered dimensions.
*
* Values of order uniquely form the set [0..rank)
* and specify ordering of the dimensions.
* Order = {0,1,2,...} is LayoutLeft
* Order = {...,2,1,0} is LayoutRight
*/
template< typename iTypeOrder , typename iTypeDimen >
KOKKOS_INLINE_FUNCTION static
LayoutStride order_dimensions( int const rank
, iTypeOrder const * const order
, iTypeDimen const * const dimen )
{
LayoutStride tmp ;
// Verify valid rank order:
int check_input = MAX_RANK < rank ? 0 : int( 1 << rank ) - 1 ;
for ( int r = 0 ; r < MAX_RANK ; ++r ) {
  tmp.dimension[r] = 0 ;
  tmp.stride[r]    = 0 ;
  // Only the first 'rank' entries of 'order' are valid input; guard the read:
  if ( r < rank ) check_input &= ~int( 1 << order[r] );
}
if ( 0 == check_input ) {
size_t n = 1 ;
for ( int r = 0 ; r < rank ; ++r ) {
tmp.stride[ order[r] ] = n ;
n *= ( dimen[order[r]] );
tmp.dimension[r] = dimen[r];
}
}
return tmp ;
}
};
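/// A worked sketch of order_dimensions added for illustration (the rank-3
/// dimensions {2,3,4} are hypothetical): order {0,1,2} reproduces
/// LayoutLeft strides and order {2,1,0} reproduces LayoutRight strides.
/// \code
///   const int ord_left[3]  = { 0 , 1 , 2 };
///   const int ord_right[3] = { 2 , 1 , 0 };
///   const int dims[3]      = { 2 , 3 , 4 };
///   Kokkos::LayoutStride L = Kokkos::LayoutStride::order_dimensions( 3 , ord_left  , dims );
///   Kokkos::LayoutStride R = Kokkos::LayoutStride::order_dimensions( 3 , ord_right , dims );
///   // L.stride = { 1 , 2 , 6 , ... }  : strides grow left-to-right
///   // R.stride = { 12 , 4 , 1 , ... } : strides grow right-to-left
/// \endcode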
//----------------------------------------------------------------------------
/// \struct LayoutTileLeft
/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
/// striding of multi-indices by tiles.
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Tiled layout" indicates a mapping to contiguously stored
/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
/// dimensions. Indices are LayoutLeft within each tile, and the
/// tiles themselves are arranged using LayoutLeft. Note that the
/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
/// compile-time constants. This speeds up index calculations. If
/// both tile dimensions are powers of two, Kokkos can optimize
/// further.
template < unsigned ArgN0 , unsigned ArgN1 ,
bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value &&
Impl::is_power_of_two<ArgN1>::value )
>
struct LayoutTileLeft {
//! Tag this class as a kokkos array layout
typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
enum { N0 = ArgN0 };
enum { N1 = ArgN1 };
};
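/// A declaration sketch added for illustration: 4x4 tiles, a
/// power-of-two case the implementation can optimize.
/// \code
///   typedef Kokkos::LayoutTileLeft<4,4> TileLayout ;
///   // TileLayout::N0 == 4 and TileLayout::N1 == 4
/// \endcode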
} // namespace Kokkos
#endif // #ifndef KOKKOS_LAYOUT_HPP

View File

@ -0,0 +1,397 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MACROS_HPP
#define KOKKOS_MACROS_HPP
//----------------------------------------------------------------------------
/** Pick up configure/build options via #define macros:
*
* KOKKOS_HAVE_CUDA Kokkos::Cuda execution and memory spaces
* KOKKOS_HAVE_PTHREAD Kokkos::Threads execution space
* KOKKOS_HAVE_QTHREAD Kokkos::Qthread execution space
* KOKKOS_HAVE_OPENMP Kokkos::OpenMP execution space
* KOKKOS_HAVE_HWLOC HWLOC library is available
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK inserts array bounds checks (expensive!)
* KOKKOS_HAVE_CXX11 enable C++11 features
*
* KOKKOS_HAVE_MPI negotiate MPI/execution space interactions
*
* KOKKOS_USE_CUDA_UVM Use CUDA UVM for Cuda memory space
*/
#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
#include <KokkosCore_config.h>
#endif
//----------------------------------------------------------------------------
/** Pick up compiler specific #define macros:
*
* Macros for known compilers evaluate to an integral version value
*
* KOKKOS_COMPILER_NVCC
* KOKKOS_COMPILER_GNU
* KOKKOS_COMPILER_INTEL
* KOKKOS_COMPILER_IBM
* KOKKOS_COMPILER_CRAYC
* KOKKOS_COMPILER_APPLECC
* KOKKOS_COMPILER_CLANG
* KOKKOS_COMPILER_PGI
*
* Macros for which compiler extension to use for atomics on intrinsic types
*
* KOKKOS_ATOMICS_USE_CUDA
* KOKKOS_ATOMICS_USE_GNU
* KOKKOS_ATOMICS_USE_INTEL
* KOKKOS_ATOMICS_USE_OPENMP31
*
* A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
*
* Macros for marking functions to run in an execution space:
*
* KOKKOS_FUNCTION
* KOKKOS_INLINE_FUNCTION request compiler to inline
* KOKKOS_FORCEINLINE_FUNCTION force compiler to inline, use with care!
*/
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ )
/* Compiling with a CUDA compiler.
*
* Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
* CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
*
* When generating device code the __CUDA_ARCH__ macro is defined as:
* __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
*/
#include <cuda_runtime.h>
#include <cuda.h>
#if ! defined( CUDA_VERSION )
#error "#include <cuda.h> did not define CUDA_VERSION"
#endif
#if ( CUDA_VERSION < 6050 )
// CUDA supports C++11 in device code (unofficially) starting with
// version 6.5. This includes the auto type specifier and lambdas
// defined inside device code.
#error "Cuda version 6.5 or greater required"
#endif
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
/* Compiling with CUDA compiler for device code. */
#error "Cuda device capability >= 3.0 is required"
#endif
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
/*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */
#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
// Compiling Cuda code to 'ptx'
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#endif /* #if defined( __CUDA_ARCH__ ) */
#if defined( _OPENMP )
/* Compiling with OpenMP.
* The value of _OPENMP is an integer value YYYYMM
* where YYYY and MM are the year and month designation
* of the supported OpenMP API version.
*/
#endif /* #if defined( _OPENMP ) */
/*--------------------------------------------------------------------------*/
/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
#if defined( __NVCC__ )
// NVIDIA compiler is being used.
// Code is parsed and separated into host and device code.
// Host code is compiled again with another compiler.
// Device code is compiled to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
#else
#if defined( KOKKOS_HAVE_CXX11 ) && ! defined( KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA )
// CUDA (including version 6.5) does not support giving lambdas as
// arguments to global functions. Thus it is not currently possible
// to dispatch lambdas from the host.
#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif /* #if defined( __NVCC__ ) */
#if defined( KOKKOS_HAVE_CXX11 ) && !defined (KOKKOS_LAMBDA)
#define KOKKOS_LAMBDA [=]
#endif
#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
/* Intel compiler for host code */
#if defined( __INTEL_COMPILER )
#define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
#elif defined( __ICC )
// Old define
#define KOKKOS_COMPILER_INTEL __ICC
#elif defined( __ECC )
// Very old define
#define KOKKOS_COMPILER_INTEL __ECC
#endif
/* CRAY compiler for host code */
#if defined( _CRAYC )
#define KOKKOS_COMPILER_CRAYC _CRAYC
#endif
#if defined( __IBMCPP__ )
// IBM C++
#define KOKKOS_COMPILER_IBM __IBMCPP__
#elif defined( __IBMC__ )
#define KOKKOS_COMPILER_IBM __IBMC__
#endif
#if defined( __APPLE_CC__ )
#define KOKKOS_COMPILER_APPLECC __APPLE_CC__
#endif
#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
#define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
#endif
#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
#define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
#endif
#if defined( __PGIC__ ) && ! defined( __GNUC__ )
#define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
#endif
#endif /* #if ! defined( __CUDA_ARCH__ ) */
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Intel compiler macros */
#if defined( KOKKOS_COMPILER_INTEL )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
#define KOKKOS_HAVE_PRAGMA_IVDEP 1
#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_HAVE_PRAGMA_VECTOR 1
#define KOKKOS_HAVE_PRAGMA_SIMD 1
#if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
#define KOKKOS_ENABLE_ASM 1
#endif
#if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined (_WIN32)
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#else
#define KOKKOS_FORCEINLINE_FUNCTION inline
#endif
#endif
#if defined( __MIC__ )
// Compiling for Xeon Phi
#endif
#endif
/*--------------------------------------------------------------------------*/
/* Cray compiler macros */
#if defined( KOKKOS_COMPILER_CRAYC )
#endif
/*--------------------------------------------------------------------------*/
/* IBM Compiler macros */
#if defined( KOKKOS_COMPILER_IBM )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
//#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
/* CLANG compiler macros */
#if defined( KOKKOS_COMPILER_CLANG )
//#define KOKKOS_HAVE_PRAGMA_UNROLL 1
//#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#endif
/*--------------------------------------------------------------------------*/
/* GNU Compiler macros */
#if defined( KOKKOS_COMPILER_GNU )
//#define KOKKOS_HAVE_PRAGMA_UNROLL 1
//#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if ! defined( KOKKOS_ENABLE_ASM ) && \
! ( defined( __powerpc) || \
defined(__powerpc__) || \
defined(__powerpc64__) || \
defined(__POWERPC__) || \
defined(__ppc__) || \
defined(__ppc64__) || \
defined(__PGIC__) )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif
/*--------------------------------------------------------------------------*/
#if defined( KOKKOS_COMPILER_PGI )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
#define KOKKOS_HAVE_PRAGMA_IVDEP 1
//#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_HAVE_PRAGMA_VECTOR 1
//#define KOKKOS_HAVE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
#if defined( KOKKOS_COMPILER_NVCC )
#if defined(__CUDA_ARCH__ )
#define KOKKOS_HAVE_PRAGMA_UNROLL 1
#endif
#endif
//----------------------------------------------------------------------------
/** Define function marking macros if compiler specific macros are undefined: */
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_INLINE_FUNCTION )
#define KOKKOS_INLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_FUNCTION )
#define KOKKOS_FUNCTION /**/
#endif
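/* An annotation sketch added for illustration (the function is
 * hypothetical): mark a free function with the macros above so it can be
 * called from host code and, when CUDA is enabled, from device code.
 *
 *   KOKKOS_INLINE_FUNCTION
 *   double square( const double x ) { return x * x ; }
 *
 * Under nvcc this expands to '__device__ __host__ inline'; in a host-only
 * build it falls back to plain 'inline'.
 */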
//----------------------------------------------------------------------------
/** Determine the default execution space for parallel dispatch.
* There is zero or one default execution space specified.
*/
#if 1 < ( ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
#endif
/** If a default is not specified then choose from the enabled execution spaces.
* Priority: CUDA, OPENMP, THREADS, SERIAL
*/
#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined ( KOKKOS_HAVE_CUDA )
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined ( KOKKOS_HAVE_OPENMP )
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined ( KOKKOS_HAVE_PTHREAD )
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
#else
#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
#endif
//----------------------------------------------------------------------------
/** Determine for what space the code is being compiled: */
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_HAVE_CUDA)
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_MACROS_HPP */

View File

@ -0,0 +1,116 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_MEMORYTRAITS_HPP
#define KOKKOS_MEMORYTRAITS_HPP
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Memory access traits for views, an extension point.
*
* These traits should be orthogonal. If there are dependencies then
* the MemoryTraits template must detect and enforce dependencies.
*
* A zero value is the default for a View, indicating that none of
* these traits are present.
*/
enum MemoryTraitsFlags
{ Unmanaged = 0x01
, RandomAccess = 0x02
, Atomic = 0x04
};
template < unsigned T >
struct MemoryTraits {
//! Tag this class as a kokkos memory traits:
typedef MemoryTraits memory_traits ;
enum { Unmanaged = T & unsigned(Kokkos::Unmanaged) };
enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
enum { Atomic = T & unsigned(Kokkos::Atomic) };
};
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
typedef Kokkos::MemoryTraits<0> MemoryManaged ;
typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ;
typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ;
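/** A combination sketch added for illustration (the typedef name is
 *  hypothetical): flags are bitwise-or'ed, so an unmanaged view with
 *  atomic element access would use
 *
 *    typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::Atomic >
 *      MemoryUnmanagedAtomic ;
 */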
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Memory alignment settings
*
* Sets global value for memory alignment. Must be a power of two!
 * Enables compatibility of views from different devices with static strides.
 * Override the default via the KOKKOS_MEMORY_ALIGNMENT compiler define.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value )
#else
( 1 << Kokkos::Impl::power_of_two< 128 >::value )
#endif
, MEMORY_ALIGNMENT_THRESHOLD = 4
};
} //namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */

View File

@ -0,0 +1,175 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_HPP
#define KOKKOS_OPENMP_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP )
#include <omp.h>
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/// \class OpenMP
/// \brief Kokkos device for multicore processors in the host memory space.
class OpenMP {
public:
//------------------------------------
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
typedef OpenMP execution_space ;
typedef HostSpace memory_space ;
//! This execution space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef LayoutRight array_layout ;
typedef HostSpace::size_type size_type ;
typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
//@}
//------------------------------------
//! \name Functions that all Kokkos devices must implement.
//@{
inline static bool in_parallel() { return omp_in_parallel(); }
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
static bool sleep();
/** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
static bool wake();
/** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
static void fence() {}
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
/// \brief Free any resources being consumed by the device.
static void finalize();
/** \brief Initialize the device.
*
* 1) If the hardware locality library is enabled and OpenMP has not
* already bound threads then bind OpenMP threads to maximize
* core utilization and group for memory hierarchy locality.
*
* 2) Allocate a HostThread for each OpenMP thread to hold its
* topology and fan in/out data.
*/
static void initialize( unsigned thread_count = 0 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 );
static int is_initialized();
//@}
//------------------------------------
/** \brief This execution space has a topological thread pool which can be queried.
*
* All threads within a pool have a common memory space for which they are cache coherent.
* depth = 0 gives the number of threads in the whole pool.
* depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache.
* depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache.
*/
inline static int thread_pool_size( int depth = 0 );
/** \brief The rank of the executing thread in this thread pool */
KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
//------------------------------------
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
KOKKOS_INLINE_FUNCTION static
unsigned hardware_thread_id() { return thread_pool_rank(); }
};
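/* A query sketch added for illustration (valid after
 * Kokkos::OpenMP::initialize() has been called):
 *
 *   const int pool = Kokkos::OpenMP::thread_pool_size(0); // whole pool
 *   const int numa = Kokkos::OpenMP::thread_pool_size(1); // per NUMA region
 *   const int fine = Kokkos::OpenMP::thread_pool_size(2); // finest grouping
 */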
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::OpenMP::memory_space
, Kokkos::OpenMP::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
/*--------------------------------------------------------------------------*/
#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) */
#endif /* #ifndef KOKKOS_OPENMP_HPP */

View File

@ -0,0 +1,498 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
/// \file Kokkos_Pair.hpp
/// \brief Declaration and definition of Kokkos::pair.
///
/// This header file declares and defines Kokkos::pair and its related
/// nonmember functions.
#ifndef KOKKOS_PAIR_HPP
#define KOKKOS_PAIR_HPP
#include <Kokkos_Macros.hpp>
#include <utility>
namespace Kokkos {
/// \struct pair
/// \brief Replacement for std::pair that works on CUDA devices.
///
/// The instance methods of std::pair, including its constructors, are
/// not marked as <tt>__device__</tt> functions. Thus, they cannot be
/// called on a CUDA device, such as an NVIDIA GPU. This struct
/// implements the same interface as std::pair, but can be used on a
/// CUDA device as well as on the host.
template <class T1, class T2>
struct pair
{
//! The first template parameter of this class.
typedef T1 first_type;
//! The second template parameter of this class.
typedef T2 second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Default constructor.
///
/// This calls the default constructors of T1 and T2. It won't
/// compile if those default constructors are not defined and
/// public.
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first(), second()
{}
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type const& f, second_type const& s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1, T2> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1&, T2&>
{
//! The first template parameter of this class.
typedef T1& first_type;
//! The second template parameter of this class.
typedef T2& second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type f, second_type s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1, T2&>
{
//! The first template parameter of this class.
typedef T1 first_type;
//! The second template parameter of this class.
typedef T2& second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type const& f, second_type s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
template <class T1, class T2>
struct pair<T1&, T2>
{
//! The first template parameter of this class.
typedef T1& first_type;
//! The second template parameter of this class.
typedef T2 second_type;
//! The first element of the pair.
first_type first;
//! The second element of the pair.
second_type second;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
pair(first_type f, second_type const& s)
: first(f), second(s)
{}
/// \brief Copy constructor.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
// from std::pair<U,V>
template <class U, class V>
pair( const std::pair<U,V> &p)
: first(p.first), second(p.second)
{}
/// \brief Assignment operator.
///
/// This calls the assignment operators of T1 and T2. It won't
/// compile if the assignment operators are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
pair<first_type, second_type> & operator=(const pair<U,V> &p)
{
first = p.first;
second = p.second;
return *this;
}
/// \brief Return the std::pair version of this object.
///
/// This is <i>not</i> a device function; you may not call it on a
/// CUDA device. It is meant to be called on the host, if the user
/// wants an std::pair instead of a Kokkos::pair.
///
/// \note This is not a conversion operator, since defining a
/// conversion operator made the relational operators have
/// ambiguous definitions.
std::pair<T1,T2> to_std_pair() const
{ return std::make_pair(first,second); }
};
//! Equality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first==rhs.first && lhs.second==rhs.second; }
//! Inequality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs==rhs); }
//! Less-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
//! Less-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(rhs<lhs); }
//! Greater-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return rhs<lhs; }
//! Greater-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs<rhs); }
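// Note: these operators give the same lexicographic ordering as
// std::pair's relational operators; for example, pair(1,5) < pair(2,0)
// and pair(1,5) < pair(1,7) both hold.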
/// \brief Return a new pair.
///
/// This is a "nonmember constructor" for Kokkos::pair. It works just
/// like std::make_pair.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1,T2> make_pair (T1 x, T2 y)
{ return ( pair<T1,T2>(x,y) ); }
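// A minimal usage sketch (illustrative only, not part of the interface):
//   Kokkos::pair<int, float> p = Kokkos::make_pair( 1 , 2.5f );
//   std::pair<int, float>  q = p.to_std_pair(); // host-only call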
/// \brief Return a pair of references to the input arguments.
///
/// This is analogous to std::tie (new in C++11).  You can use it to
/// assign to two variables at once, from the result of a function
/// that returns a pair. For example (<tt>__device__</tt> and
/// <tt>__host__</tt> attributes omitted for brevity):
/// \code
/// // Declaration of the function to call.
/// // First return value: operation count.
/// // Second return value: whether all operations succeeded.
/// Kokkos::pair<int, bool> someFunction ();
///
/// // Code that uses Kokkos::tie.
/// int myFunction () {
/// int count = 0;
/// bool success = false;
///
/// // This assigns to both count and success.
/// Kokkos::tie (count, success) = someFunction ();
///
/// if (! success) {
/// // ... Some operation failed;
/// // take corrective action ...
/// }
/// return count;
/// }
/// \endcode
///
/// The line that uses tie() could have been written like this:
/// \code
/// Kokkos::pair<int, bool> result = someFunction ();
/// count = result.first;
/// success = result.second;
/// \endcode
///
/// Using tie() saves two lines of code and avoids a copy of each
/// element of the pair. The latter could be significant if one or
/// both elements of the pair are more substantial objects than \c int
/// or \c bool.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1 &,T2 &> tie (T1 & x, T2 & y)
{ return ( pair<T1 &,T2 &>(x,y) ); }
//
// Specialization of Kokkos::pair for a \c void second argument. This
// is not actually a "pair"; it only contains one element, the first.
//
template <class T1>
struct pair<T1,void>
{
typedef T1 first_type;
typedef void second_type;
first_type first;
enum { second = 0 };
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first()
{}
KOKKOS_FORCEINLINE_FUNCTION
pair(const first_type & f)
: first(f)
{}
KOKKOS_FORCEINLINE_FUNCTION
pair(const first_type & f, int)
: first(f)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
pair( const pair<U,void> &p)
: first(p.first)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
pair<T1, void> & operator=(const pair<U,void> &p)
{
first = p.first;
return *this;
}
};
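// Illustrative usage sketch of this specialization: only 'first' is
// stored, and 'second' is the enum constant 0 rather than a data member.
//   Kokkos::pair<int, void> p( 42 );  // p.first == 42 , p.second == 0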
//
// Specialization of relational operators for Kokkos::pair<T1,void>.
//
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first==rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs==rhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first<rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(rhs<lhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return rhs<lhs; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs<rhs); }
} // namespace Kokkos
#endif //KOKKOS_PAIR_HPP

View File

@ -0,0 +1,908 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Parallel.hpp
/// \brief Declaration of parallel operators
#ifndef KOKKOS_PARALLEL_HPP
#define KOKKOS_PARALLEL_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_ExecPolicy.hpp>
#ifdef KOKKOSP_ENABLE_PROFILING
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#ifdef KOKKOS_HAVE_DEBUG
#include<iostream>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief Given a Functor and an Execution Policy, query an execution space.
 *
 *  If the Policy has an execution_space, use that;
 *  else if the Functor has an execution_space, use that;
 *  else if the Functor has a device_type, use that (for backward compatibility);
 *  else use the default.
*/
template< class Functor
, class Policy
, class EnableFunctor = void
, class EnablePolicy = void
>
struct FunctorPolicyExecutionSpace {
typedef Kokkos::DefaultExecutionSpace execution_space ;
};
template< class Functor , class Policy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::device_type >::type
, typename enable_if_type< typename Policy ::execution_space >::type
>
{
typedef typename Policy ::execution_space execution_space ;
};
template< class Functor , class Policy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::execution_space >::type
, typename enable_if_type< typename Policy ::execution_space >::type
>
{
typedef typename Policy ::execution_space execution_space ;
};
template< class Functor , class Policy , class EnableFunctor >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, EnableFunctor
, typename enable_if_type< typename Policy::execution_space >::type
>
{
typedef typename Policy ::execution_space execution_space ;
};
template< class Functor , class Policy , class EnablePolicy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::device_type >::type
, EnablePolicy
>
{
typedef typename Functor::device_type execution_space ;
};
template< class Functor , class Policy , class EnablePolicy >
struct FunctorPolicyExecutionSpace
< Functor , Policy
, typename enable_if_type< typename Functor::execution_space >::type
, EnablePolicy
>
{
typedef typename Functor::execution_space execution_space ;
};
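// Illustrative example of the selection rules above, assuming a
// hypothetical functor type that is not part of this header:
//   struct MyFunctor {
//     typedef Kokkos::DefaultHostExecutionSpace execution_space ;
//     void operator()( int ) const {}
//   };
//   // With a policy type that lacks an execution_space typedef,
//   // FunctorPolicyExecutionSpace< MyFunctor , void >::execution_space
//   // is MyFunctor::execution_space; with no hints from either the
//   // functor or the policy it falls back to Kokkos::DefaultExecutionSpace.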
//----------------------------------------------------------------------------
/// \class ParallelFor
/// \brief Implementation of the ParallelFor operator that has a
/// partial specialization for the device.
///
/// This is an implementation detail of parallel_for. Users should
/// skip this and go directly to the nonmember function parallel_for.
template< class FunctorType , class ExecPolicy > class ParallelFor ;
/// \class ParallelReduce
/// \brief Implementation detail of parallel_reduce.
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType , class ExecPolicy > class ParallelReduce ;
/// \class ParallelScan
/// \brief Implementation detail of parallel_scan.
///
/// This is an implementation detail of parallel_scan. Users should
/// skip this and go directly to the documentation of the nonmember
/// template function Kokkos::parallel_scan.
template< class FunctorType , class ExecPolicy > class ParallelScan ;
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/** \brief Execute \c functor in parallel according to the execution \c policy.
*
* A "functor" is a class containing the function to execute in parallel,
* data needed for that execution, and an optional \c execution_space
* typedef. Here is an example functor for parallel_for:
*
* \code
* class FunctorType {
* public:
* typedef ... execution_space ;
* void operator() ( WorkType iwork ) const ;
* };
* \endcode
*
 * In the above example, \c WorkType is any integer type for which a
 * valid conversion from \c size_t to \c WorkType exists.  Its
 * <tt>operator()</tt> method defines the operation to parallelize,
 * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
 * Each call corresponds to a single iteration \c iwork of a \c for loop.
 * If \c execution_space is not defined, DefaultExecutionSpace will be used.
*/
template< class ExecPolicy , class FunctorType >
inline
void parallel_for( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelFor< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
}
#endif
}
template< class FunctorType >
inline
void parallel_for( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelFor< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
}
#endif
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_for( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor )
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
#endif
parallel_for(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_for kernel: " << str << std::endl;
#endif
(void) str;
}
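// A minimal dispatch sketch (illustrative; assumes a hypothetical
// functor 'Scale' with a void operator()(int) const, a view 'x' it
// captures, and a problem size 'n'):
//   Kokkos::parallel_for( Kokkos::RangePolicy<>( 0 , n ) , Scale( x ) );
//   Kokkos::parallel_for( n , Scale( x ) );  // equivalent shorthand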
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Parallel reduction
*
* Example of a parallel_reduce functor for a POD (plain old data) value type:
* \code
* class FunctorType { // For POD value type
* public:
* typedef ... execution_space ;
* typedef <podType> value_type ;
* void operator()( <intType> iwork , <podType> & update ) const ;
* void init( <podType> & update ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
* \endcode
*
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
* \code
* class FunctorType { // For array of POD value
* public:
* typedef ... execution_space ;
* typedef <podType> value_type[] ;
* void operator()( <intType> , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
* \endcode
*/
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
// typedef typename
// Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
// execution_space ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , result_view );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
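// A minimal dispatch sketch (illustrative; 'SumFunctor' is a
// hypothetical functor with 'typedef double value_type ;' and an
// operator()( int , double & ) const that accumulates into its
// second argument):
//   double sum = 0 ;
//   Kokkos::parallel_reduce( n , SumFunctor( x ) , sum );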
// general policy and view output
template< class ExecPolicy , class FunctorType , class ViewType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and POD or array-of-POD output
template< class ExecPolicy , class FunctorType >
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
#ifdef KOKKOS_HAVE_CUDA
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value &&
! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
, const std::string& str = ""
, typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
)
#else
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value)
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
>::type result_ref
, const std::string& str = ""
)
#endif
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, ValueTraits::value_count( functor )
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and view output
template< class FunctorType , class ViewType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Impl::is_view<ViewType>::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > ExecPolicy ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , ExecPolicy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and POD or array-of-POD output
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits<
typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
Impl::is_integral<FunctorType>::value,
void,FunctorType>::type
, void >::reference_type result
, const std::string& str = ""
, typename Impl::enable_if< true
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
>::type * = 0 )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, ValueTraits::value_count( functor )
);
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType * result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType & result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
/// \fn parallel_scan
/// \tparam ExecutionPolicy The execution policy type.
/// \tparam FunctorType The scan functor type.
///
/// \param policy [in] The execution policy.
/// \param functor [in] The scan functor.
///
/// This function implements a parallel scan pattern. The scan can
/// be either inclusive or exclusive, depending on how you implement
/// the scan functor.
///
/// A scan functor looks almost exactly like a reduce functor, except
/// that its operator() takes a third \c bool argument, \c final_pass,
/// which indicates whether this is the last pass of the scan
/// operation. We will show below how to use the \c final_pass
/// argument to control whether the scan is inclusive or exclusive.
///
/// Here is the minimum required interface of a scan functor for a POD
/// (plain old data) value type \c PodType. That is, the result is a
/// View of zero or more PodType. It is also possible for the result
/// to be an array of (same-sized) arrays of PodType, but we do not
/// show the required interface for that here.
/// \code
/// template< class ExecPolicy , class FunctorType >
/// class ScanFunctor {
/// public:
///   // The Kokkos execution space type
/// typedef ... execution_space;
/// // Type of an entry of the array containing the result;
/// // also the type of each of the entries combined using
/// // operator() or join().
/// typedef PodType value_type;
///
/// void operator () (const ExecPolicy::member_type & i, value_type& update, const bool final_pass) const;
/// void init (value_type& update) const;
///   void join (volatile value_type& update, volatile const value_type& input) const;
/// };
/// \endcode
///
/// Here is an example of a functor which computes an inclusive plus-scan
/// of an array of \c int, in place. If given an array [1, 2, 3, 4], this
/// scan will overwrite that array with [1, 3, 6, 10].
///
/// \code
/// template<class SpaceType>
/// class InclScanFunctor {
/// public:
/// typedef SpaceType execution_space;
/// typedef int value_type;
/// typedef typename SpaceType::size_type size_type;
///
/// InclScanFunctor( Kokkos::View<value_type*, execution_space> x
/// , Kokkos::View<value_type*, execution_space> y ) : m_x(x), m_y(y) {}
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const {
/// update += m_x(i);
/// if (final_pass) {
/// m_y(i) = update;
/// }
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, execution_space> m_x;
/// Kokkos::View<value_type*, execution_space> m_y;
/// };
/// \endcode
///
/// Here is an example of a functor which computes an <i>exclusive</i>
/// scan of an array of \c int, in place.  In operator(), note that
/// the final_pass test and the update have switched places, and note
/// the use of a temporary.  If given an array [1, 2, 3, 4], this scan
/// will overwrite that array with [0, 1, 3, 6].
///
/// \code
/// template<class SpaceType>
/// class ExclScanFunctor {
/// public:
/// typedef SpaceType execution_space;
/// typedef int value_type;
/// typedef typename SpaceType::size_type size_type;
///
/// ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {}
///
/// void operator () (const size_type i, value_type& update, const bool final_pass) const {
/// const value_type x_i = x_(i);
/// if (final_pass) {
/// x_(i) = update;
/// }
/// update += x_i;
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, execution_space> x_;
/// };
/// \endcode
///
/// Here is an example of a functor which builds on the above
/// exclusive scan example, to compute an offsets array from a
/// population count array, in place. We assume that the pop count
/// array has an extra entry at the end to store the final count. If
/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
/// array with [0, 1, 3, 6, 10].
///
/// \code
/// template<class SpaceType>
/// class OffsetScanFunctor {
/// public:
/// typedef SpaceType execution_space;
/// typedef int value_type;
/// typedef typename SpaceType::size_type size_type;
///
///   // last_index_ is the last valid index (zero-based) of x.
///   // If x has length zero, then last_index_ won't be used anyway.
/// OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x
/// , Kokkos::View<value_type*, execution_space> y )
/// : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
/// {}
///
/// void operator () (const size_type i, int& update, const bool final_pass) const {
/// if (final_pass) {
/// m_y(i) = update;
/// }
/// update += m_x(i);
/// // The last entry of m_y gets the final sum.
/// if (final_pass && i == last_index_) {
/// m_y(i+1) = update;
/// }
/// }
/// void init (value_type& update) const {
/// update = 0;
/// }
/// void join (volatile value_type& update, volatile const value_type& input) const {
/// update += input;
/// }
///
/// private:
/// Kokkos::View<value_type*, execution_space> m_x;
/// Kokkos::View<value_type*, execution_space> m_y;
/// const size_type last_index_;
/// };
/// \endcode
///
template< class ExecutionPolicy , class FunctorType >
inline
void parallel_scan( const ExecutionPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
)
{
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Impl::ParallelScan< FunctorType , ExecutionPolicy > scan( Impl::CopyWithoutTracking::apply(functor) , policy );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
}
#endif
}
template< class FunctorType >
inline
void parallel_scan( const size_t work_count
, const FunctorType & functor
, const std::string& str = "" )
{
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
#ifdef KOKKOSP_ENABLE_PROFILING
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
(void) Impl::ParallelScan< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID);
}
#endif
}
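// A minimal dispatch sketch (illustrative), using the InclScanFunctor
// example documented above with views 'x' and 'y' of length n:
//   Kokkos::parallel_scan( n , InclScanFunctor< Kokkos::DefaultExecutionSpace >( x , y ) );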
template< class ExecutionPolicy , class FunctorType >
inline
void parallel_scan( const std::string& str
, const ExecutionPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
#endif
parallel_scan(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_scan kernel: " << str << std::endl;
#endif
(void) str;
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Enable = void >
struct FunctorTeamShmemSize
{
static inline size_t value( const FunctorType & , int ) { return 0 ; }
};
template< class FunctorType >
struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type >
{
static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; }
};
template< class FunctorType >
struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type >
{
static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; }
};
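// Illustrative sketch of how a functor opts in (hypothetical functor,
// not part of this header). Defining either hook routes through the
// matching specialization above; otherwise the request is 0 bytes:
//   struct MyTeamFunctor {
//     size_t team_shmem_size( int team_size ) const
//       { return team_size * sizeof(double) ; }
//   };
//   // FunctorTeamShmemSize< MyTeamFunctor >::value( f , team_size )
//   // == team_size * sizeof(double)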
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_PARALLEL_HPP */

View File

@ -0,0 +1,165 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_HPP
#define KOKKOS_QTHREAD_HPP
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Core.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class QthreadExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Execution space supported by Qthread */
class Qthread {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as an execution space
typedef Qthread execution_space ;
typedef Kokkos::HostSpace memory_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::LayoutRight array_layout ;
typedef memory_space::size_type size_type ;
typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
//@}
/*------------------------------------------------------------------------*/
/** \brief Initialization will construct one or more instances */
static Qthread & instance( int = 0 );
/** \brief Set the execution space to a "sleep" state.
*
 * This function puts the execution space into a "sleep" state in which
 * it is not ready for work.  This may consume fewer resources than the
 * "ready" state, but it may also take time to transition back to "ready".
*
* \return True if enters or is in the "sleep" state.
* False if functions are currently executing.
*/
bool sleep();
/** \brief Wake from the sleep state.
*
* \return True if enters or is in the "ready" state.
* False if functions are currently executing.
*/
static bool wake();
/** \brief Wait until all dispatched functors complete.
*
* The parallel_for or parallel_reduce dispatch of a functor may
* return asynchronously, before the functor completes. This
* method does not return until all dispatched functors on this
* device have completed.
*/
static void fence();
/*------------------------------------------------------------------------*/
static void initialize( int thread_count );
static void finalize();
/** \brief Print configuration information to the given output stream. */
static void print_configuration( std::ostream & , const bool detail = false );
int shepherd_size() const ;
int shepherd_worker_size() const ;
};
/*--------------------------------------------------------------------------*/
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Qthread::memory_space
, Kokkos::Qthread::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <Kokkos_Parallel.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
#include <Qthread/Kokkos_Qthread_Parallel.hpp>
#endif /* #ifndef KOKKOS_QTHREAD_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,125 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SCRATCHSPACE_HPP
#define KOKKOS_SCRATCHSPACE_HPP
#include <stdio.h>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Scratch memory space associated with an execution space.
*
*/
template< class ExecSpace >
class ScratchMemorySpace {
public:
  // Alignment of memory chunks returned by 'get_shmem';
  // must be a power of two
enum { ALIGN = 8 };
private:
mutable char * m_iter ;
char * m_end ;
ScratchMemorySpace();
ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size
public:
//! Tag this class as a memory space
typedef ScratchMemorySpace memory_space ;
typedef ExecSpace execution_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef typename ExecSpace::array_layout array_layout ;
typedef typename ExecSpace::size_type size_type ;
template< typename IntType >
KOKKOS_INLINE_FUNCTION static
IntType align( const IntType & size )
{ return ( size + MASK ) & ~MASK ; }
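  // Worked example: with ALIGN = 8, align(13) == 16 and align(16) == 16.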
template< typename IntType >
KOKKOS_INLINE_FUNCTION
void* get_shmem (const IntType& size) const {
void* tmp = m_iter ;
if (m_end < (m_iter += align (size))) {
m_iter -= align (size); // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end-m_iter));
#endif // KOKKOS_HAVE_DEBUG
tmp = 0;
}
return tmp;
}
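  // Typical use inside a team kernel (illustrative sketch; 'member' is
  // a hypothetical team-member handle whose team_shmem() returns this
  // scratch space):
  //   double * buf = (double *) member.team_shmem().get_shmem( n * sizeof(double) );
  //   if ( buf == 0 ) { /* request exceeded the remaining capacity */ }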
template< typename IntType >
KOKKOS_INLINE_FUNCTION
ScratchMemorySpace( void * ptr , const IntType & size )
: m_iter( (char *) ptr )
, m_end( m_iter + size )
{}
};
} // namespace Kokkos
#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -0,0 +1,892 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Serial.hpp
/// \brief Declaration and definition of Kokkos::Serial device.
#ifndef KOKKOS_SERIAL_HPP
#define KOKKOS_SERIAL_HPP
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
namespace Kokkos {
/// \class Serial
/// \brief Kokkos device for non-parallel execution
///
/// A "device" represents a parallel execution model. It tells Kokkos
/// how to parallelize the execution of kernels in a parallel_for or
/// parallel_reduce. For example, the Threads device uses Pthreads or
/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
/// extensions, and the Cuda device uses NVIDIA's CUDA programming
/// model. The Serial device executes "parallel" kernels
/// sequentially. This is useful if you really do not want to use
/// threads, or if you want to explore different combinations of MPI
/// and shared-memory parallel programming models.
class Serial {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as an execution space:
typedef Serial execution_space ;
//! The size_type typedef best suited for this device.
typedef HostSpace::size_type size_type ;
//! This device's preferred memory space.
typedef HostSpace memory_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
//! This device's preferred array layout.
typedef LayoutRight array_layout ;
/// \brief Scratch memory space
typedef ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
//@}
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
///
/// For the Serial device, this method <i>always</i> returns false,
/// because parallel_for or parallel_reduce with the Serial device
/// always execute sequentially.
inline static int in_parallel() { return false ; }
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence() {}
static void initialize( unsigned threads_count = 1 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false) {
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
static int is_initialized() { return 1 ; }
//! Free any resources being consumed by the device.
static void finalize() {}
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false ) {}
//--------------------------------------------------------------------------
inline static int thread_pool_size( int = 0 ) { return 1 ; }
KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
//--------------------------------------------------------------------------
static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
//--------------------------------------------------------------------------
};
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Serial::memory_space
, Kokkos::Serial::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
namespace SerialImpl {
struct Sentinel {
void * m_scratch ;
unsigned m_reduce_end ;
unsigned m_shared_end ;
Sentinel();
~Sentinel();
static Sentinel & singleton();
};
inline
unsigned align( unsigned n );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class SerialTeamMember {
private:
typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
const scratch_memory_space m_space ;
const int m_league_rank ;
const int m_league_size ;
SerialTeamMember & operator = ( const SerialTeamMember & );
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_space ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(const ValueType& , const int& ) const {}
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION
ValueType team_reduce( const ValueType & value , const JoinOp & ) const
{
return value ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
{
const Type tmp = global_accum ? *global_accum : Type(0) ;
if ( global_accum ) { *global_accum += value ; }
return tmp ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
{ return Type(0); }
//----------------------------------------
// Execution space specific:
SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
);
};
} // namespace Impl
/*
* < Kokkos::Serial , WorkArgTag >
* < WorkArgTag , Impl::enable_if< Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
*
*/
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
{
private:
const int m_league_size ;
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
//! Execution space of this execution policy:
typedef Kokkos::Serial execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::Serial , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
static
int team_size_max( const FunctorType & ) { return 1 ; }
template< class FunctorType >
static
int team_size_recommended( const FunctorType & ) { return 1 ; }
template< class FunctorType >
static
int team_size_recommended( const FunctorType & , const int& ) { return 1 ; }
//----------------------------------------
inline int team_size() const { return 1 ; }
inline int league_size() const { return m_league_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
: m_league_size( league_size_request )
{ (void) vector_length_request; }
TeamPolicy( int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
: m_league_size( league_size_request )
{ (void) vector_length_request; }
typedef Impl::SerialTeamMember member_type ;
};
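// A minimal dispatch sketch (illustrative; 'MyTeamFunctor' is a
// hypothetical functor whose operator() takes a member_type):
//   Kokkos::TeamPolicy< Kokkos::Serial > policy( league_size , 1 );
//   Kokkos::parallel_for( policy , MyTeamFunctor() );
// On Serial every team has exactly one member, so the requested team
// size is ignored and team_size() is always 1.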
} /* namespace Kokkos */
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
public:
// work tag is void
template< class PType >
inline
ParallelFor( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i );
}
}
// work tag is non-void
template< class PType >
inline
ParallelFor( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
}
}
};
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
{
public:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// Work tag is void
template< class ViewType , class PType >
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy
, const ViewType & result
)
{
pointer_type result_ptr = result.ptr_on_device();
if ( ! result_ptr ) {
result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
}
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
}
// Work tag is non-void
template< class ViewType , class PType >
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy
, const ViewType & result
)
{
pointer_type result_ptr = result.ptr_on_device();
if ( ! result_ptr ) {
result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
}
typename ValueTraits::reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
}
};
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// work tag is void
template< class PType >
inline
ParallelScan( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
pointer_type result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i , update , true );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
// work tag is non-void
template< class PType >
inline
ParallelScan( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
{
pointer_type result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , true );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
};
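/* Usage sketch (hypothetical names): a scan functor for the loops above.
 * The third argument is the 'final' flag, which this serial
 * specialization always passes as 'true'.
 *
 * \code
 * struct PrefixSum {
 *   typedef int value_type ;
 *   KOKKOS_INLINE_FUNCTION
 *   void operator()( const int i , int & update , const bool final ) const
 *   {
 *     // When final == true , 'update' holds the prefix sum at index i.
 *     update += i ;
 *   }
 * };
 * \endcode
 */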
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( TagType() , member ); }
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
{
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
Kokkos::Serial::scratch_memory_resize( 0 , shared_size );
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
ParallelFor::template driver< typename Policy::work_tag >
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) );
// functor( typename Policy::member_type(ileague,policy.league_size(),shared_size) );
}
}
};
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
private:
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member
, reference_type update )
{ functor( member , update ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member
, reference_type update )
{ functor( TagType() , member , update ); }
public:
template< class ViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const ViewType & result
)
{
const int reduce_size = ValueTraits::value_size( functor );
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
void * const scratch_reduce = Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size );
const pointer_type result_ptr =
result.ptr_on_device() ? result.ptr_on_device()
: (pointer_type) scratch_reduce ;
reference_type update = ValueInit::init( functor , result_ptr );
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
ParallelReduce::template driver< typename Policy::work_tag >
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
const iType begin ;
const iType end ;
enum {increment = 1};
const SerialTeamMember& thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
: begin(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
: begin( arg_begin )
, end( arg_end)
, thread( arg_thread )
{}
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
end( count )
{}
};
} // namespace Impl
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & begin , const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
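/* Usage sketch: inside a team functor, with 'member' the team handle and
 * 'data' / 'n' hypothetical:
 *
 * \code
 * Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n ) ,
 *   [&]( const int i ) { data[i] = 2 * i ; } );
 * \endcode
 */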
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
 * val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
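/* Usage sketch ('member' , 'data' , 'n' hypothetical): a team-wide sum
 * using the overload above.
 *
 * \code
 * double team_sum = 0 ;
 * Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
 *   [&]( const int i , double & update ) { update += data[i] ; } , team_sum );
 * \endcode
 */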
#ifdef KOKKOS_HAVE_CXX11
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
 *
 * The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
 * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
 * The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
 * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
 * '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
#endif // KOKKOS_HAVE_CXX11
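/* Usage sketch for the join variant above: a max-reduction, where
 * init_result must be the identity of the join operation (here a very
 * small value; 'member' , 'data' , 'n' hypothetical).
 *
 * \code
 * double team_max = -1.0e300 ;
 * Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
 *   [&]( const int i , double & val ) { if ( val < data[i] ) val = data[i] ; } ,
 *   []( double & dst , const double & src ) { if ( dst < src ) dst = src ; } ,
 *   team_max );
 * \endcode
 */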
} //namespace Kokkos
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
 * (i.e. team_size==1) the operator is only called once with final==true. scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
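/* Usage sketch: an exclusive prefix sum over the vector range
 * ('member' , 'n' , 'in' , 'out' hypothetical):
 *
 * \code
 * Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , n ) ,
 *   [&]( const int i , float & val , const bool final ) {
 *     if ( final ) { out[i] = val ; }  // exclusive prefix at i
 *     val += in[i] ;
 *   } );
 * \endcode
 */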
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
}
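/* Usage sketch: run a lambda once per team or once per thread, optionally
 * broadcasting a value ('member' , 'counter' , 'val' hypothetical). In this
 * serial specialization every overload simply invokes the lambda.
 *
 * \code
 * Kokkos::single( Kokkos::PerTeam( member ) , [&]() { ++counter ; } );
 * Kokkos::single( Kokkos::PerThread( member ) , [&]( double & v ) { v = 42.0 ; } , val );
 * \endcode
 */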
#endif // defined( KOKKOS_HAVE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -0,0 +1,376 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_TASKPOLICY_HPP
#define KOKKOS_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
struct FutureValueTypeIsVoidError {};
template < class ExecSpace , class ResultType , class FunctorType >
class TaskMember ;
template< class ExecPolicy , class ResultType , class FunctorType >
class TaskForEach ;
template< class ExecPolicy , class ResultType , class FunctorType >
class TaskReduce ;
template< class ExecPolicy , class ResultType , class FunctorType >
struct TaskScan ;
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
/**\brief States of a task */
enum TaskState
{ TASK_STATE_NULL = 0 ///< Does not exist
, TASK_STATE_CONSTRUCTING = 1 ///< Is under construction
, TASK_STATE_WAITING = 2 ///< Is waiting for execution
, TASK_STATE_EXECUTING = 4 ///< Is executing
, TASK_STATE_COMPLETE = 8 ///< Execution is complete
};
/**
*
* Future< space > // value_type == void
* Future< value > // space == Default
* Future< value , space >
*
*/
template< class Arg1 = void , class Arg2 = void >
class Future {
private:
template< class , class , class > friend class Impl::TaskMember ;
template< class > friend class TaskPolicy ;
template< class , class > friend class Future ;
// Argument #2, if not void, must be the space.
enum { Arg1_is_space = Kokkos::Impl::is_execution_space< Arg1 >::value };
enum { Arg2_is_space = Kokkos::Impl::is_execution_space< Arg2 >::value };
enum { Arg2_is_void = Kokkos::Impl::is_same< Arg2 , void >::value };
struct ErrorNoExecutionSpace {};
enum { Opt1 = Arg1_is_space && Arg2_is_void
, Opt2 = ! Arg1_is_space && Arg2_is_void
, Opt3 = ! Arg1_is_space && Arg2_is_space
, OptOK = Kokkos::Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value
};
typedef typename
Kokkos::Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type
ValueType ;
typedef typename
Kokkos::Impl::if_c< Opt1 , Arg1 , typename
Kokkos::Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename
Kokkos::Impl::if_c< Opt3 , Arg2 , void
>::type >::type >::type
ExecutionSpace ;
typedef Impl::TaskMember< ExecutionSpace , void , void > TaskRoot ;
typedef Impl::TaskMember< ExecutionSpace , ValueType , void > TaskValue ;
TaskRoot * m_task ;
public:
typedef ValueType value_type;
typedef ExecutionSpace execution_space ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
TaskState get_task_state() const
{ return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; }
//----------------------------------------
explicit
Future( TaskRoot * task )
: m_task(0)
{ TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( task ) ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
~Future() { TaskRoot::assign( & m_task , 0 ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
Future() : m_task(0) {}
KOKKOS_INLINE_FUNCTION
Future( const Future & rhs )
: m_task(0)
{ TaskRoot::assign( & m_task , rhs.m_task ); }
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future & rhs )
{ TaskRoot::assign( & m_task , rhs.m_task ); return *this ; }
//----------------------------------------
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( const Future<A1,A2> & rhs )
: m_task(0)
{ TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); }
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future<A1,A2> & rhs )
{ TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); return *this ; }
//----------------------------------------
typedef typename TaskValue::get_result_type get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const
{ return static_cast<TaskValue*>( m_task )->get(); }
};
namespace Impl {
template< class T >
struct is_future : public Kokkos::Impl::bool_< false > {};
template< class Arg0 , class Arg1 >
struct is_future< Kokkos::Experimental::Future<Arg0,Arg1> > : public Kokkos::Impl::bool_< true > {};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
/** \brief If the template argument is an execution space then create() produces a serial task in that space */
template< class Arg0 = Kokkos::DefaultExecutionSpace >
class TaskPolicy {
public:
typedef typename Arg0::execution_space execution_space ;
//----------------------------------------
/** \brief Create a serial task with storage for dependences.
*
* Postcondition: Task is in the 'constructing' state.
*/
template< class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Create a foreach task with storage for dependences. */
template< class ExecPolicy , class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create_foreach( const ExecPolicy & policy
, const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Create a reduce task with storage for dependences. */
template< class ExecPolicy , class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Create a scan task with storage for dependences. */
template< class ExecPolicy , class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create_scan( const ExecPolicy & policy
, const FunctorType & functor
, const unsigned dependence_capacity /* = default */ ) const ;
/** \brief Set dependence that 'after' cannot start execution
* until 'before' has completed.
*
 * Precondition: The 'after' task must be in the 'constructing' state.
*/
template< class TA , class TB >
void set_dependence( const Future<TA,execution_space> & after
, const Future<TB,execution_space> & before ) const ;
/** \brief Spawn a task in the 'Constructing' state
*
* Precondition: Task is in the 'constructing' state.
* Postcondition: Task is waiting, executing, or complete.
*/
template< class T >
const Future<T,execution_space> &
spawn( const Future<T,execution_space> & ) const ;
//----------------------------------------
/** \brief Query dependence of an executing task */
template< class FunctorType >
Future< execution_space >
get_dependence( FunctorType * , const int ) const ;
//----------------------------------------
/** \brief Clear current dependences of an executing task
* in preparation for setting new dependences and
* respawning.
*
* Precondition: The functor must be a task in the executing state.
*/
template< class FunctorType >
void clear_dependence( FunctorType * ) const ;
/** \brief Set dependence that 'after' cannot start execution
* until 'before' has completed.
*
* The 'after' functor must be in the executing state
*/
template< class FunctorType , class TB >
void set_dependence( FunctorType * after
, const Future<TB,execution_space> & before ) const ;
/** \brief Respawn (reschedule) an executing task to be called again
* after all dependences have completed.
*/
template< class FunctorType >
void respawn( FunctorType * ) const ;
};
//----------------------------------------------------------------------------
/** \brief Create and spawn a single-thread task */
template< class ExecSpace , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn( TaskPolicy<ExecSpace> & policy , const FunctorType & functor )
{ return policy.spawn( policy.create( functor ) ); }
/** \brief Create and spawn a single-thread task with dependences */
template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn( TaskPolicy<ExecSpace> & policy
, const FunctorType & functor
, const Future<Arg0,Arg1> & before_0
, const Future<Arg0,Arg1> & before_1 )
{
Future< typename FunctorType::value_type , ExecSpace > f ;
f = policy.create( functor , 2 );
policy.add_dependence( f , before_0 );
policy.add_dependence( f , before_1 );
policy.spawn( f );
return f ;
}
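/* Usage sketch of the free functions above ('MyTask' is a hypothetical
 * functor whose 'value_type' is double; wait() is declared at the end of
 * this header):
 *
 * \code
 * TaskPolicy< Kokkos::Serial > policy ;
 * Future< double , Kokkos::Serial > f = spawn( policy , MyTask() );
 * wait( policy );
 * const double v = f.get();
 * \endcode
 */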
//----------------------------------------------------------------------------
/** \brief Create and spawn a parallel_for task */
template< class ExecSpace , class ParallelPolicyType , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn_foreach( TaskPolicy<ExecSpace> & task_policy
, const ParallelPolicyType & parallel_policy
, const FunctorType & functor )
{ return task_policy.spawn( task_policy.create_foreach( parallel_policy , functor ) ); }
/** \brief Create and spawn a parallel_reduce task */
template< class ExecSpace , class ParallelPolicyType , class FunctorType >
inline
Future< typename FunctorType::value_type , ExecSpace >
spawn_reduce( TaskPolicy<ExecSpace> & task_policy
, const ParallelPolicyType & parallel_policy
, const FunctorType & functor )
{ return task_policy.spawn( task_policy.create_reduce( parallel_policy , functor ) ); }
//----------------------------------------------------------------------------
/** \brief Respawn a task functor with dependences */
template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
inline
void respawn( TaskPolicy<ExecSpace> & policy
, FunctorType * functor
, const Future<Arg0,Arg1> & before_0
, const Future<Arg0,Arg1> & before_1
)
{
policy.clear_dependence( functor );
policy.add_dependence( functor , before_0 );
policy.add_dependence( functor , before_1 );
policy.respawn( functor );
}
//----------------------------------------------------------------------------
template< class ExecSpace >
void wait( TaskPolicy< ExecSpace > & );
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_TASKPOLICY_HPP */
@ -0,0 +1,217 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADS_HPP
#define KOKKOS_THREADS_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class ThreadsExec ;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Execution space for a pool of Pthreads or C11 threads on a CPU. */
class Threads {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as a kokkos execution space
typedef Threads execution_space ;
typedef Kokkos::HostSpace memory_space ;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::LayoutRight array_layout ;
typedef memory_space::size_type size_type ;
typedef ScratchMemorySpace< Threads > scratch_memory_space ;
//@}
/*------------------------------------------------------------------------*/
//! \name Static functions that all Kokkos devices must implement.
//@{
/// \brief True if and only if this method is being called in a
/// thread-parallel function.
static int in_parallel();
/** \brief Set the device in a "sleep" state.
*
* This function sets the device in a "sleep" state in which it is
* not ready for work. This may consume less resources than if the
* device were in an "awake" state, but it may also take time to
* bring the device from a sleep state to be ready for work.
*
* \return True if the device is in the "sleep" state, else false if
* the device is actively working and could not enter the "sleep"
* state.
*/
static bool sleep();
/// \brief Wake the device from the 'sleep' state so it is ready for work.
///
/// \return True if the device is in the "ready" state, else "false"
/// if the device is actively working (which also means that it's
/// awake).
static bool wake();
/// \brief Wait until all dispatched functors complete.
///
/// The parallel_for or parallel_reduce dispatch of a functor may
/// return asynchronously, before the functor completes. This
/// method does not return until all dispatched functors on this
/// device have completed.
static void fence();
/// \brief Free any resources being consumed by the device.
///
/// For the Threads device, this terminates spawned worker threads.
static void finalize();
/// \brief Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false );
//@}
/*------------------------------------------------------------------------*/
/*------------------------------------------------------------------------*/
//! \name Space-specific functions
//@{
/** \brief Initialize the device in the "ready to work" state.
*
* The device is initialized in a "ready to work" or "awake" state.
* This state reduces latency and thus improves performance when
* dispatching work. However, the "awake" state consumes resources
* even when no work is being done. You may call sleep() to put
* the device in a "sleeping" state that does not consume as many
* resources, but it will take time (latency) to awaken the device
 * again (via the wake() method) so that it is ready for work.
*
 * Teams of threads are distributed as evenly as possible across
 * the requested number of NUMA regions and cores per NUMA region.
 * A team will not be split across NUMA regions.
*
 * If the 'use_' arguments are not supplied then hwloc is queried
 * and all available cores are used.
*/
static void initialize( unsigned threads_count = 0 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false );
static int is_initialized();
static Threads & instance( int = 0 );
//----------------------------------------
static int thread_pool_size( int depth = 0 );
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static int thread_pool_rank();
#else
KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
#endif
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
//@}
//----------------------------------------
};
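/* Usage sketch: bring the thread pool up before dispatching work and tear
 * it down at exit (the thread count here is hypothetical):
 *
 * \code
 * Kokkos::Threads::initialize( 16 );
 * // ... parallel_for / parallel_reduce dispatches on Kokkos::Threads ...
 * Kokkos::Threads::finalize();
 * \endcode
 */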
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Threads::memory_space
, Kokkos::Threads::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <Threads/Kokkos_ThreadsExec.hpp>
#include <Threads/Kokkos_ThreadsTeam.hpp>
#include <Threads/Kokkos_Threads_Parallel.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
#endif /* #define KOKKOS_THREADS_HPP */
@ -0,0 +1,53 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
/// \file Kokkos_Vectorization.hpp
/// \brief Declaration and definition of Kokkos::Vectorization interface.
#ifndef KOKKOS_VECTORIZATION_HPP
#define KOKKOS_VECTORIZATION_HPP
#if defined( KOKKOS_HAVE_CUDA )
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
#endif
#endif
File diff suppressed because it is too large
@ -0,0 +1,140 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HWLOC_HPP
#define KOKKOS_HWLOC_HPP
#include <utility>
namespace Kokkos {
/** \brief Minimal subset of logical 'hwloc' functionality available
* from http://www.open-mpi.org/projects/hwloc/.
*
* The calls are NOT thread safe in order to avoid mutexes,
* memory allocations, or other actions which could give the
* runtime system an opportunity to migrate the threads or
* touch allocated memory during the function calls.
*
* All calls to these functions should be performed by a thread
* when it has guaranteed exclusive access; e.g., for OpenMP
* within a 'critical' region.
*/
namespace hwloc {
/** \brief Query if hwloc is available */
bool available();
/** \brief Query number of available NUMA regions.
* This will be less than the hardware capacity
* if the MPI process is pinned to a NUMA region.
*/
unsigned get_available_numa_count();
/** \brief Query number of available cores per NUMA region.
* This will be less than the hardware capacity
* if the MPI process is pinned to a set of cores.
*/
unsigned get_available_cores_per_numa();
/** \brief Query number of available "hard" threads per core; i.e., hyperthreads */
unsigned get_available_threads_per_core();
} /* namespace hwloc */
} /* namespace Kokkos */
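/* Usage sketch: size a thread pool from the queries above.
 *
 * \code
 * unsigned nthreads = 1 ;
 * if ( Kokkos::hwloc::available() ) {
 *   nthreads = Kokkos::hwloc::get_available_numa_count()
 *            * Kokkos::hwloc::get_available_cores_per_numa()
 *            * Kokkos::hwloc::get_available_threads_per_core();
 * }
 * \endcode
 */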
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Internal functions for binding persistent spawned threads.
namespace Kokkos {
namespace hwloc {
/** \brief Recommend mapping of threads onto cores.
*
* If thread_count == 0 then choose and set a value.
* If use_numa_count == 0 then choose and set a value.
* If use_cores_per_numa == 0 then choose and set a value.
*
 * Returns 0 if asynchronous.
 * Returns 1 if synchronous, in which case threads_coord[0] is the process core.
*/
unsigned thread_mapping( const char * const label ,
const bool allow_async ,
unsigned & thread_count ,
unsigned & use_numa_count ,
unsigned & use_cores_per_numa ,
std::pair<unsigned,unsigned> threads_coord[] );
/** \brief Query core-coordinate of the current thread
* with respect to the core_topology.
*
* As long as the thread is running within the
 * process binding, the following conditions hold:
*
* core_coordinate.first < core_topology.first
* core_coordinate.second < core_topology.second
*/
std::pair<unsigned,unsigned> get_this_thread_coordinate();
/** \brief Bind the current thread to a core. */
bool bind_this_thread( const std::pair<unsigned,unsigned> );
/** \brief Bind the current thread to one of the cores in the list.
* Set that entry to (~0,~0) and return the index.
* If binding fails return ~0.
*/
unsigned bind_this_thread( const unsigned coordinate_count ,
std::pair<unsigned,unsigned> coordinate[] );
/** \brief Unbind the current thread back to the original process binding */
bool unbind_this_thread();
} /* namespace hwloc */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_HWLOC_HPP */
lib/kokkos/core/src/Makefile Executable file
@ -0,0 +1,118 @@
KOKKOS_PATH = ../..
PREFIX ?= /usr/local/lib/kokkos
default: messages build-lib
echo "End Build"
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXXFLAGS ?= -O3
LINK = nvcc_wrapper
LINKFLAGS ?=
else
CXX ?= g++
CXXFLAGS ?= -O3
LINK ?= g++
LINKFLAGS ?=
endif
PWD = $(shell pwd)
KOKKOS_HEADERS_INCLUDE = $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
KOKKOS_HEADERS_INCLUDE_IMPL = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
CONDITIONAL_COPIES =
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
CONDITIONAL_COPIES += copy-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
CONDITIONAL_COPIES += copy-threads
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
CONDITIONAL_COPIES += copy-openmp
endif
messages:
echo "Start Build"
build-makefile-kokkos:
rm -f Makefile.kokkos
echo "#Global Settings used to generate this library" >> Makefile.kokkos
echo "KOKKOS_PATH = $(PREFIX)" >> Makefile.kokkos
echo "KOKKOS_DEVICES = $(KOKKOS_DEVICES)" >> Makefile.kokkos
echo "KOKKOS_ARCH = $(KOKKOS_ARCH)" >> Makefile.kokkos
echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos
echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos
echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
echo "CXX ?= $(CXX)" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Variables used in application Makefiles" >> Makefile.kokkos
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
echo "KOKKOS_LINK_DEPENDS = $(KOKKOS_LINK_DEPENDS)" >> Makefile.kokkos
echo "KOKKOS_LIBS = $(KOKKOS_LIBS)" >> Makefile.kokkos
echo "KOKKOS_LDFLAGS = $(KOKKOS_LDFLAGS)" >> Makefile.kokkos
sed \
-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' Makefile.kokkos \
> Makefile.kokkos.tmp
mv -f Makefile.kokkos.tmp Makefile.kokkos
build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
mkdir:
mkdir -p $(PREFIX)
mkdir -p $(PREFIX)/include
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/include/impl
copy-cuda: mkdir
mkdir -p $(PREFIX)/include/Cuda
cp $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda
copy-threads: mkdir
mkdir -p $(PREFIX)/include/Threads
cp $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
copy-openmp: mkdir
mkdir -p $(PREFIX)/include/OpenMP
cp $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
install: mkdir $(CONDITIONAL_COPIES) build-lib
cp $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
cp $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
cp Makefile.kokkos $(PREFIX)
cp libkokkos.a $(PREFIX)/lib
cp KokkosCore_config.h $(PREFIX)/include
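# Usage sketch (prefix path hypothetical): build and install the library,
# then have an application Makefile include $(PREFIX)/Makefile.kokkos:
#   make install PREFIX=/opt/kokkos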
clean: kokkos-clean
rm Makefile.kokkos
@ -0,0 +1,496 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMP_PARALLEL_HPP
#define KOKKOS_OPENMP_PARALLEL_HPP
#include <omp.h>
#include <Kokkos_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( iwork );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork );
}
}
public:
inline
ParallelFor( const FunctorType & functor
, const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) );
}
/* END #pragma omp parallel */
}
};
} // namespace Impl
} // namespace Kokkos
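/* Usage sketch: the specialization above is reached through the ordinary
 * front end ('SomeFunctor' and 'N' hypothetical):
 *
 * \code
 * Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , N ) ,
 *                       SomeFunctor() );
 * \endcode
 */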
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( iwork , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork , update );
}
}
public:
//----------------------------------------
template< class ViewType >
inline
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value
), const FunctorType & >::type functor
, const Policy & policy
, const ViewType & result_view )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , 0 );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueInit::init( functor , exec.scratch_reduce() )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
);
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
if ( result_view.ptr_on_device() ) {
const int n = ValueTraits::value_count( functor );
for ( int j = 0 ; j < n ; ++j ) { result_view.ptr_on_device()[j] = ptr[j] ; }
}
}
}
};
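/* Usage sketch: per the enable_if above, the result argument must be a
 * view in HostSpace ('functor' and 'N' hypothetical):
 *
 * \code
 * Kokkos::View< double , Kokkos::HostSpace > result( "result" );
 * Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , N ) ,
 *                          functor , result );
 * \endcode
 */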
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , WorkTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range
, const bool final )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( iwork , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range
, const bool final )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork , update , final );
}
}
public:
//----------------------------------------
inline
ParallelScan( const FunctorType & functor
, const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( functor ) , 0 );
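// Scratch layout per thread is [ inter-thread offset | local partial ] ,
// each of value_size: the first parallel pass accumulates each thread's
// partial into the second half, and the serial loop below converts the
// partials into exclusive per-thread offsets in the first half.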
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueInit::init( functor , pointer_type( exec.scratch_reduce() ) + ValueTraits::value_count( functor ) )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
, false );
}
/* END #pragma omp parallel */
{
const unsigned thread_count = OpenMPexec::pool_size();
const unsigned value_count = ValueTraits::value_count( functor );
pointer_type ptr_prev = 0 ;
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
if ( ptr_prev ) {
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
ValueJoin::join( functor , ptr + value_count , ptr );
}
else {
ValueInit::init( functor , ptr );
}
ptr_prev = ptr ;
}
}
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueOps::reference( pointer_type( exec.scratch_reduce() ) )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
, true );
}
/* END #pragma omp parallel */
}
//----------------------------------------
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( TagType() , member ); }
public:
inline
ParallelFor( const FunctorType & functor ,
const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
OpenMPexec::resize_scratch( 0 , team_reduce_size + team_shmem_size );
#pragma omp parallel
{
typename Policy::member_type member( * OpenMPexec::get_thread_omp() , policy , team_shmem_size );
for ( ; member.valid() ; member.next() ) {
ParallelFor::template driver< typename Policy::work_tag >( functor , member );
}
}
/* END #pragma omp parallel */
}
void wait() {}
};
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const typename PType::member_type & member
, reference_type update )
{ functor( member , update ); }
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const typename PType::member_type & member
, reference_type update )
{ functor( typename PType::work_tag() , member , update ); }
public:
inline
ParallelReduce( const FunctorType & functor ,
const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
ParallelReduce::template driver< Policy >( functor , member , update );
}
}
/* END #pragma omp parallel */
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag , reference_type > Join ;
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
Join::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
}
}
template< class ViewType >
inline
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const ViewType & result )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
ParallelReduce::template driver< Policy >( functor , member , update );
}
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
const int n = ValueTraits::value_count( functor );
for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; }
}
}
void wait() {}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_OPENMP_PARALLEL_HPP */

View File

@@ -0,0 +1,364 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdio.h>
#include <limits>
#include <iostream>
#include <vector>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <iostream>
#ifdef KOKKOS_HAVE_OPENMP
namespace Kokkos {
namespace Impl {
namespace {
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel();
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel()
{
#ifndef __CUDA_ARCH__
return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
#else
return 0;
#endif
}
bool s_using_hwloc = false;
} // namespace
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
OpenMPexec::Pool OpenMPexec::m_pool;
void OpenMPexec::verify_is_process( const char * const label )
{
if ( omp_in_parallel() ) {
std::string msg( label );
msg.append( " ERROR: in parallel" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void OpenMPexec::verify_initialized( const char * const label )
{
if ( 0 == m_pool[0] ) {
std::string msg( label );
msg.append( " ERROR: not initialized" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void OpenMPexec::clear_scratch()
{
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
m_pool.at(rank_rev).clear();
}
/* END #pragma omp parallel */
}
void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
{
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
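// e.g. with a hypothetical Kokkos::Impl::MEMORY_ALIGNMENT of 64, a
// request of 100 bytes is rounded up to 128 bytes.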
// Allocate if the requested sizes exceed the existing allocation:
const bool allocate = ( old_reduce_size < reduce_size ) ||
( old_thread_size < thread_size );
if ( allocate ) {
if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
}
const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
const int pool_size = m_pool_topo[0] ;
if ( allocate ) {
clear_scratch();
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
const int rank = pool_size - ( rank_rev + 1 );
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
}
/* END #pragma omp parallel */
}
}
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
int OpenMP::is_initialized()
{ return 0 != Impl::OpenMPexec::m_pool[0]; }
void OpenMP::initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa )
{
// Before any other call to OMP, query the maximum number of threads
// and save the value for re-initialization unit testing.
// Using omp_get_max_threads() is problematic in conjunction with
// hwloc on Intel: an initial call into the OpenMP runtime without a
// preceding parallel region sets a process mask for a single core.
// Upon entering the first parallel region the runtime then binds
// threads to other cores and makes the process mask the aggregate of
// the thread masks. The intent seems to be to make serial code run
// fast if you compile with OpenMP enabled but never actually use a
// parallel region.
//static int omp_max_threads = omp_get_max_threads();
int nthreads = 0;
#pragma omp parallel
{
#pragma omp atomic
nthreads++;
}
static int omp_max_threads = nthreads;
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
bool thread_spawn_failed = false ;
if ( ! is_initialized ) {
// Use hwloc thread pinning if concerned with locality:
// when spreading threads across multiple NUMA regions,
// or when hyperthreading is enabled.
Impl::s_using_hwloc = hwloc::available() && (
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
// If hwloc is available then use its maximum value.
if ( thread_count == 0 ) {
thread_count = Impl::s_using_hwloc
? Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core()
: omp_max_threads ;
}
if(Impl::s_using_hwloc)
hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
false /* do not allow asynchronous */ ,
thread_count ,
use_numa_count ,
use_cores_per_numa ,
threads_coord );
// Spawn threads:
omp_set_num_threads( thread_count );
// Verify OMP interaction:
if ( int(thread_count) != omp_get_max_threads() ) {
thread_spawn_failed = true ;
}
// Verify spawning and bind threads:
#pragma omp parallel
{
#pragma omp critical
{
if ( int(thread_count) != omp_get_num_threads() ) {
thread_spawn_failed = true ;
}
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
// Call to 'new' may not be thread safe either.
// Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
const unsigned omp_rank = omp_get_thread_num();
const unsigned thread_r = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ;
Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
if ( ! thread_spawn_failed ) {
Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
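// Worked example with hypothetical values: thread_count == 16,
// use_numa_count == 2, use_cores_per_numa == 4 with hwloc enabled
// yields m_pool_topo == { 16, 8, 2 }: 16 threads total, 8 threads per
// NUMA region, 2 threads per core.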
Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
}
}
if ( is_initialized || thread_spawn_failed ) {
std::string msg("Kokkos::OpenMP::initialize ERROR");
if ( is_initialized ) { msg.append(" : already initialized"); }
if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
Kokkos::Impl::throw_runtime_exception(msg);
}
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
//----------------------------------------------------------------------------
void OpenMP::finalize()
{
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
Impl::OpenMPexec::clear_scratch();
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
Impl::OpenMPexec::m_pool_topo[2] = 0 ;
omp_set_num_threads(1);
if ( Impl::s_using_hwloc ) {
hwloc::unbind_this_thread();
}
}
//----------------------------------------------------------------------------
void OpenMP::print_configuration( std::ostream & s , const bool detail )
{
Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
s << "Kokkos::OpenMP" ;
#if defined( KOKKOS_HAVE_OPENMP )
s << " KOKKOS_HAVE_OPENMP" ;
#endif
#if defined( KOKKOS_HAVE_HWLOC )
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
;
#endif
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
if ( is_initialized ) {
const int numa_count = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
const int core_per_numa = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
s << " thread_pool_topology[ " << numa_count
<< " x " << core_per_numa
<< " x " << thread_per_core
<< " ]"
<< std::endl ;
if ( detail ) {
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
#pragma omp parallel
{
#pragma omp critical
{
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
s << " thread omp_rank[" << i << "]"
<< " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
<< std::endl ;
}
}
}
else {
s << " not initialized" << std::endl ;
}
}
} // namespace Kokkos
#endif //KOKKOS_HAVE_OPENMP

View File

@@ -0,0 +1,767 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMPEXEC_HPP
#define KOKKOS_OPENMPEXEC_HPP
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief Data for OpenMP thread execution */
class OpenMPexec {
public:
enum { MAX_THREAD_COUNT = 4096 };
struct Pool
{
Pool() : m_trackers() {}
AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
OpenMPexec * operator[](int i)
{
return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
}
AllocationTracker & at(int i)
{
return m_trackers[i];
}
};
private:
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
static Pool m_pool; // Indexed by: m_pool_rank_rev
friend class Kokkos::OpenMP ;
int const m_pool_rank ;
int const m_pool_rank_rev ;
int const m_scratch_exec_end ;
int const m_scratch_reduce_end ;
int const m_scratch_thread_end ;
int volatile m_barrier_state ;
OpenMPexec();
OpenMPexec( const OpenMPexec & );
OpenMPexec & operator = ( const OpenMPexec & );
static void clear_scratch();
public:
// Topology of a cache coherent thread pool:
// TOTAL = NUMA x GRAIN
// pool_size( depth = 0 )
// pool_size(0) = total number of threads
// pool_size(1) = number of threads per NUMA
// pool_size(2) = number of threads sharing finest grain memory hierarchy
inline static
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
inline static
OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
inline int pool_rank() const { return m_pool_rank ; }
inline int pool_rank_rev() const { return m_pool_rank_rev ; }
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
inline
void state_wait( int state )
{ Impl::spinwait( m_barrier_state , state ); }
inline
void state_set( int state ) { m_barrier_state = state ; }
~OpenMPexec() {}
OpenMPexec( const int poolRank
, const int scratch_exec_size
, const int scratch_reduce_size
, const int scratch_thread_size )
: m_pool_rank( poolRank )
, m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
, m_scratch_exec_end( scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
, m_barrier_state(0)
{}
static void finalize();
static void initialize( const unsigned team_count ,
const unsigned threads_per_team ,
const unsigned numa_count ,
const unsigned cores_per_numa );
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static void resize_scratch( size_t reduce_size , size_t thread_size );
inline static
OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class OpenMPexecTeamMember {
private:
enum { TEAM_REDUCE_SIZE = 512 };
/** \brief Thread states for team synchronization */
enum { Active = 0 , Rendezvous = 1 };
typedef Kokkos::OpenMP execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::OpenMPexec & m_exec ;
scratch_memory_space m_team_shared ;
int m_team_shmem ;
int m_team_base_rev ;
int m_team_rank_rev ;
int m_team_rank ;
int m_team_size ;
int m_league_rank ;
int m_league_end ;
int m_league_size ;
// Fan-in team threads, root of the fan-in which does not block returns true
inline
bool team_fan_in() const
{
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
}
if ( m_team_rank_rev ) {
m_exec.state_set( Rendezvous );
m_exec.state_wait( Rendezvous );
}
return 0 == m_team_rank_rev ;
}
inline
void team_fan_out() const
{
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
}
}
public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{
if ( 1 < m_team_size ) {
team_fan_in();
team_fan_out();
}
}
#endif
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType& value, const int& thread_id) const
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ }
#else
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
, ValueType , void >::type type ;
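// If ValueType does not fit within TEAM_REDUCE_SIZE then 'type'
// collapses to 'void' and the dereferences below fail to compile,
// turning a scratch-buffer overflow into a compile-time error.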
type * const local_value = ((type*) m_exec.scratch_thread());
if(team_rank() == thread_id)
*local_value = value;
memory_fence();
team_barrier();
value = *local_value;
#endif
}
#ifdef KOKKOS_HAVE_CXX11
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
, const JoinOp & op_in ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ValueType(); }
#else
{
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
#endif
#else // KOKKOS_HAVE_CXX11
template< class JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{
typedef typename JoinOp::value_type value_type;
#endif
#endif // KOKKOS_HAVE_CXX11
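// The function body below is shared by both declarations above: the
// C++11 branch first wraps the lambda join in a JoinLambdaAdapter named
// 'op'; the pre-C++11 branch uses the JoinOp's value_type directly.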
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
type * const local_value = ((type*) m_exec.scratch_thread());
// Set this thread's contribution
*local_value = value ;
// Fence to make sure the base team member has access:
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
type * const team_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
// Join to the team value:
for ( int i = 1 ; i < m_team_size ; ++i ) {
op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
}
// The base team member may "lap" the other team members,
// so copy the result to their local values before proceeding.
for ( int i = 1 ; i < m_team_size ; ++i ) {
*((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
}
// Fence to make sure all team members have access
memory_fence();
}
team_fan_out();
return *((type volatile const *)local_value);
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ArgType(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
volatile type * const work_value = ((type*) m_exec.scratch_thread());
*work_value = value ;
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
// m_team_base[0] == highest ranking team member
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
//
// 1) copy from lower to higher rank, initialize lowest rank to zero
// 2) prefix sum from lowest to highest rank, skipping lowest rank
type accum = 0 ;
if ( global_accum ) {
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
accum += val ;
}
accum = atomic_fetch_add( global_accum , accum );
}
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
const type offset = accum ;
accum += val ;
val = offset ;
}
memory_fence();
}
team_fan_out();
return *work_value ;
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
{ return this-> template team_scan<Type>( value , 0 ); }
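// Illustrative usage sketch ('member' and 'my_count' are hypothetical
// names, not from this file):
//   const int offset = member.team_scan( my_count );
//   // 'offset' is the exclusive prefix sum of 'my_count' in team rank
//   // order; the highest rank recovers the total as offset + my_count.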
//----------------------------------------
// Private for the driver
private:
typedef execution_space::scratch_memory_space space ;
public:
template< class Arg0 , class Arg1 >
inline
OpenMPexecTeamMember( Impl::OpenMPexec & exec
, const TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > & team
, const int shmem_size
)
: m_exec( exec )
, m_team_shared(0,0)
, m_team_shmem( shmem_size )
, m_team_base_rev(0)
, m_team_rank_rev(0)
, m_team_rank(0)
, m_team_size( team.team_size() )
, m_league_rank(0)
, m_league_end(0)
, m_league_size( team.league_size() )
{
const int pool_rank_rev = m_exec.pool_rank_rev();
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
const int league_iter_end = team.league_size() - pool_league_rank_rev * team.team_iter();
if ( pool_team_rank_rev < m_team_size && 0 < league_iter_end ) {
m_team_base_rev = team.team_alloc() * pool_league_rank_rev ;
m_team_rank_rev = pool_team_rank_rev ;
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
m_league_end = league_iter_end ;
m_league_rank = league_iter_end > team.team_iter() ? league_iter_end - team.team_iter() : 0 ;
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
}
}
bool valid() const
{ return m_league_rank < m_league_end ; }
void next()
{
if ( ++m_league_rank < m_league_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
}
}
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
};
} // namespace Impl
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
{
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
//! Execution space of this execution policy.
typedef Kokkos::OpenMP execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::OpenMP , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return execution_space::thread_pool_size(1); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & )
{ return execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return execution_space::thread_pool_size(2); }
//----------------------------------------
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
int m_team_iter ;
inline void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = execution_space::thread_pool_size(0);
const int team_max = execution_space::thread_pool_size(1);
const int team_grain = execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
// Maximum number of iterations each team will take:
m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;
}
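// Worked example with hypothetical sizes, assuming team_max >= 6:
// pool_size == 16, team_grain == 2, league_size_request == 10, and
// team_size_request == 6 give m_team_size = 6, team_size_grain = 6,
// team_count = 16 / 6 = 2, m_team_alloc = 16 / 2 = 8, and
// m_team_iter = (10 + 2 - 1) / 2 = 5.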
public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1)
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
inline int team_alloc() const { return m_team_alloc ; }
inline int team_iter() const { return m_team_iter ; }
typedef Impl::OpenMPexecTeamMember member_type ;
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
inline
int OpenMP::thread_pool_size( int depth )
{
return Impl::OpenMPexec::pool_size(depth);
}
KOKKOS_INLINE_FUNCTION
int OpenMP::thread_pool_rank()
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
#else
return -1 ;
#endif
}
} // namespace Kokkos
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& begin, const iType& end) {
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,join);
}
} //namespace Kokkos
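// Illustrative nested-parallelism sketch (functor body only; 'member',
// 'N', and 'sum' are hypothetical names, not from this file):
//   double sum = 0;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , N ) ,
//     [&] ( const int i , double & val ) { val += double(i) ; } , sum );
//   // every thread of the team receives the same 'sum' via team_reduce.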
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
} // namespace Kokkos
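// Illustrative vector-scan sketch ('member', 'n', and 'data' are
// hypothetical names, not from this file):
//   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , n ) ,
//     [&] ( const int i , float & partial , const bool final ) {
//       const float v = data[i];
//       if ( final ) data[i] = partial ; // write the exclusive prefix
//       partial += v ;                   // contribute regardless of 'final'
//     } );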
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
}
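// Illustrative 'single' usage ('member' and 'val' are hypothetical
// names, not from this file):
//   int val = 0;
//   Kokkos::single( Kokkos::PerTeam( member ) ,
//     [&] ( int & v ) { v = 42 ; } , val );
//   // team rank 0 executes the lambda; 'val' is then broadcast so all
//   // team members observe the same value.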
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */

View File

@@ -0,0 +1,484 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_QTHREAD )
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <utility>
#include <Kokkos_Qthread.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
// Defines to enable experimental Qthread functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread/qthread.h>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
/** s_exec is indexed by the reverse rank of the workers
* for faster fan-in / fan-out lookups
* [ n - 1 , n - 2 , ... , 0 ]
*/
QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
int s_number_shepherds = 0 ;
int s_number_workers_per_shepherd = 0 ;
int s_number_workers = 0 ;
inline
QthreadExec ** worker_exec()
{
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
}
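// e.g. with hypothetical counts of 2 shepherds x 4 workers each
// (s_number_workers == 8), shepherd 1 / local worker 2 maps to
// s_exec[ 8 - (1*4 + 2 + 1) ] == s_exec[1].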
const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
int s_worker_reduce_end = 0 ; /* End of worker reduction memory */
int s_worker_shared_end = 0 ; /* Total of worker scratch memory */
int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
QthreadExecFunctionPointer volatile s_active_function = 0 ;
const void * volatile s_active_function_arg = 0 ;
} /* namespace */
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
void Qthread::initialize( int thread_count )
{
// Environment variable: QTHREAD_NUM_SHEPHERDS
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
// Environment variable: QTHREAD_HWPAR
{
char buffer[256];
snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
putenv(buffer);
}
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
( thread_count == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
( thread_count == qthread_num_workers() );
bool ok_symmetry = true ;
if ( ok_init ) {
Impl::s_number_shepherds = qthread_num_shepherds();
Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
}
}
if ( ! ok_init || ! ok_symmetry ) {
std::ostringstream msg ;
msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
msg << " : qthread_num_workers = " << qthread_num_workers();
if ( ! ok_symmetry ) {
msg << " : qthread_num_workers_local = {" ;
for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
msg << " " << qthread_num_workers_local(i) ;
}
msg << " }" ;
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
if ( ok_init ) { qthread_finalize(); }
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
void Qthread::finalize()
{
Impl::QthreadExec::clear_workers();
if ( Impl::s_number_workers ) {
qthread_finalize();
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
}
void Qthread::print_configuration( std::ostream & s , const bool detail )
{
s << "Kokkos::Qthread {"
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
<< " }" << std::endl ;
}
Qthread & Qthread::instance( int )
{
static Qthread q ;
return q ;
}
void Qthread::fence()
{
}
int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
aligned_t driver_exec_all( void * arg )
{
QthreadExec & exec = **worker_exec();
(*s_active_function)( exec , s_active_function_arg );
/*
fprintf( stdout
, "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, exec.worker_rank()
, exec.worker_size()
, exec.shepherd_rank()
, exec.shepherd_size()
, exec.shepherd_worker_rank()
, exec.shepherd_worker_size()
);
fflush(stdout);
*/
return 0 ;
}
aligned_t driver_resize_worker_scratch( void * arg )
{
static volatile int lock_begin = 0 ;
static volatile int lock_end = 0 ;
QthreadExec ** const exec = worker_exec();
//----------------------------------------
// Serialize allocation for thread safety
while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
const bool ok = 0 == *exec ;
if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
lock_begin = 0 ; // release lock
if ( ok ) { new( *exec ) QthreadExec(); }
//----------------------------------------
// Wait for all calls to complete to ensure that each worker has executed.
if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
while ( lock_end );
/*
fprintf( stdout
, "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, (**exec).worker_rank()
, (**exec).worker_size()
, (**exec).shepherd_rank()
, (**exec).shepherd_size()
, (**exec).shepherd_worker_rank()
, (**exec).shepherd_worker_size()
);
fflush(stdout);
*/
//----------------------------------------
if ( ! ok ) {
fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
fflush( stderr );
}
return 0 ;
}
void verify_is_process( const char * const label , bool not_active = false )
{
const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
const bool is_active = not_active && ( s_active_function || s_active_function_arg );
if ( not_process || is_active ) {
std::string msg( label );
msg.append( " : FAILED" );
if ( not_process ) msg.append(" : not called by main process");
if ( is_active ) msg.append(" : parallel execution in progress");
Kokkos::Impl::throw_runtime_exception( msg );
}
}
}
int QthreadExec::worker_per_shepherd()
{
return s_number_workers_per_shepherd ;
}
QthreadExec::QthreadExec()
{
const int shepherd_rank = qthread_shep();
const int shepherd_worker_rank = qthread_worker_local(NULL);
const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
m_worker_base = s_exec ;
m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
m_scratch_alloc = ( (unsigned char *) this ) + s_base_size ;
m_reduce_end = s_worker_reduce_end ;
m_shepherd_rank = shepherd_rank ;
m_shepherd_size = s_number_shepherds ;
m_shepherd_worker_rank = shepherd_worker_rank ;
m_shepherd_worker_size = s_number_workers_per_shepherd ;
m_worker_rank = worker_rank ;
m_worker_size = s_number_workers ;
m_worker_state = QthreadExec::Active ;
}
void QthreadExec::clear_workers()
{
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
QthreadExec * const exec = s_exec[iwork] ;
s_exec[iwork] = 0 ;
free( exec );
}
}
void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
{
new( & space )
Qthread::scratch_memory_space(
((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
s_worker_shared_end - s_worker_shared_begin
);
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
const int exec_all_reduce_alloc = align_alloc( reduce_size );
const int shepherd_scan_alloc = align_alloc( 8 );
const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
if ( s_worker_reduce_end < exec_all_reduce_alloc ||
s_worker_shared_end < shepherd_shared_end ) {
/*
fprintf( stdout , "QthreadExec::resize\n");
fflush(stdout);
*/
// Clear current worker memory before allocating new worker memory
clear_workers();
// Increase the buffers to an aligned allocation
s_worker_reduce_end = exec_all_reduce_alloc ;
s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
s_worker_shared_end = shepherd_shared_end ;
// Need to query which shepherd this main 'process' is running on...
const int main_shep = qthread_shep();
// Have each worker resize its memory for proper first-touch
#if 0
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_resize_worker_scratch /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_resize_worker_scratch( NULL );
// Verify all workers allocated
bool ok = true ;
for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
if ( ! ok ) {
std::ostringstream msg ;
msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
}
msg << " }" ;
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
}
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
verify_is_process("QthreadExec::exec_all(...)",true);
/*
fprintf( stdout , "QthreadExec::exec_all\n");
fflush(stdout);
*/
s_active_function = func ;
s_active_function_arg = arg ;
// Need to query which shepherd this main 'process' is running on...
const int main_shep = qthread_shep();
#if 0
for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_exec_all /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_exec_all( NULL );
s_active_function = 0 ;
s_active_function_arg = 0 ;
}
void * QthreadExec::exec_all_reduce_result()
{
return s_exec[0]->m_scratch_alloc ;
}
} /* namespace Impl */
} /* namespace Kokkos */
namespace Kokkos {
namespace Impl {
QthreadTeamPolicyMember::QthreadTeamPolicyMember()
: m_exec( **worker_exec() )
, m_team_shared(0,0)
, m_team_size( 1 ) // s_number_workers_per_shepherd )
, m_team_rank( 0 ) // m_exec.shepherd_worker_rank() )
, m_league_size(1)
, m_league_end(1)
, m_league_rank(0)
{
m_exec.shared_reset( m_team_shared );
}
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */

View File

@@ -0,0 +1,614 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREADEXEC_HPP
#define KOKKOS_QTHREADEXEC_HPP
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
class QthreadExec ;
typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
class QthreadExec {
private:
enum { Inactive = 0 , Active = 1 };
const QthreadExec * const * m_worker_base ;
const QthreadExec * const * m_shepherd_base ;
void * m_scratch_alloc ; ///< Scratch memory [ reduce , team , shared ]
int m_reduce_end ; ///< End of scratch reduction memory
int m_shepherd_rank ;
int m_shepherd_size ;
int m_shepherd_worker_rank ;
int m_shepherd_worker_size ;
/*
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
*/
int m_worker_rank ;
int m_worker_size ;
int mutable volatile m_worker_state ;
friend class Kokkos::Qthread ;
~QthreadExec();
QthreadExec( const QthreadExec & );
QthreadExec & operator = ( const QthreadExec & );
public:
QthreadExec();
/** Execute the input function on all available Qthread workers */
static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
//----------------------------------------
/** Barrier across all workers participating in the 'exec_all' */
void exec_all_barrier() const
{
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
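/* Note on the fan pattern above: with rev_rank = m_worker_size - (m_worker_rank+1)
 * and worker data stored in reverse-rank order, each worker whose low bits of
 * rev_rank are zero gathers from partners at rev_rank+1, rev_rank+2, rev_rank+4, ...
 * For example with four workers: rev_rank 2 waits on 3, rev_rank 0 (the root)
 * waits on 1 and 2, and the release then walks the same binary tree in reverse.
 */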
/** Barrier across workers within the shepherd with rank < team_rank */
void shepherd_barrier( const int team_size ) const
{
if ( m_shepherd_worker_rank < team_size ) {
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
}
}
//----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
inline
void exec_all_reduce( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
const QthreadExec & fan = *m_worker_base[j];
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
//----------------------------------------
/** Scan across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
inline
void exec_all_scan( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , ArgTag > ValueOps ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
for ( int i = 1 ; i < m_worker_size ; ++i ) {
ValueOps::copy( func
, m_worker_base[i-1]->m_scratch_alloc
, m_worker_base[i]->m_scratch_alloc
);
}
ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
// Join from lower ranking to higher ranking worker.
// Value at m_worker_base[ m_worker_size - 1 ] is zero so skip adding it to m_worker_base[ m_worker_size - 2 ].
for ( int i = m_worker_size - 1 ; --i ; ) {
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
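/* Worked example of the scan above with four workers, where the worker of
 * rank r contributes v_r (data is in reverse order, so m_worker_base[0]
 * holds v3): the copy loop shifts the slots to { v2, v1, v0, v0 }, the init
 * resets the last slot to the identity { v2, v1, v0, 0 }, and the join loop
 * accumulates to { v0+v1+v2, v0+v1, v0, 0 } -- an exclusive prefix sum in
 * rank order.
 */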
//----------------------------------------
template< class Type>
inline
volatile Type * shepherd_team_scratch_value() const
{ return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
template< class Type >
inline
void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
{
if ( m_shepherd_base ) {
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
memory_fence();
shepherd_barrier( team_size );
value = *shared_value ;
}
}
template< class Type >
inline
Type shepherd_reduce( const int team_size , const Type & value ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) { // join every team member, not the fan-in stride 'n'
accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class JoinOp >
inline
typename JoinOp::value_type
shepherd_reduce( const int team_size
, const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef typename JoinOp::value_type Type ;
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class Type >
inline
Type shepherd_scan( const int team_size
, const Type & value
, Type * const global_value = 0 ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
accum += tmp ;
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
}
* m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
// Join from lower ranking to higher ranking worker.
for ( int i = team_size ; --i ; ) {
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
//----------------------------------------
static inline
int align_alloc( int size )
{
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64 bytes */ };
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
}
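/* E.g. align_alloc(1) == 64, align_alloc(64) == 64, and align_alloc(70) == 128:
 * sizes are rounded up to the 64-byte allocation grain. */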
void shared_reset( Qthread::scratch_memory_space & );
void * exec_all_reduce_value() const { return m_scratch_alloc ; }
static void * exec_all_reduce_result();
static void resize_worker_scratch( const int reduce_size , const int shared_size );
static void clear_workers();
//----------------------------------------
inline int worker_rank() const { return m_worker_rank ; }
inline int worker_size() const { return m_worker_size ; }
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
inline int shepherd_rank() const { return m_shepherd_rank ; }
inline int shepherd_size() const { return m_shepherd_size ; }
static int worker_per_shepherd();
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadTeamPolicyMember {
private:
typedef Kokkos::Qthread execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::QthreadExec & m_exec ;
scratch_memory_space m_team_shared ;
const int m_team_size ;
const int m_team_rank ;
const int m_league_size ;
const int m_league_end ;
int m_league_rank ;
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{ m_exec.shepherd_barrier( m_team_size ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
#endif
template< typename JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{ return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
#endif
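/* Worked example: with team_size == 4 and every rank contributing 10,
 * team_scan returns 0, 10, 20, 30 for ranks 0..3, and the highest rank
 * recovers the reduction total as 30 + 10 == 40. */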
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
#endif
//----------------------------------------
// Private driver for task-team parallel
QthreadTeamPolicyMember();
//----------------------------------------
// Private for the driver: ( for ( member_type i(exec,team) ; i ; i.next_team() ) { ... } )
// Initialize
template< class Arg0 , class Arg1 >
QthreadTeamPolicyMember( Impl::QthreadExec & exec , const TeamPolicy<Arg0,Arg1,Qthread> & team )
: m_exec( exec )
, m_team_shared(0,0)
, m_team_size( team.m_team_size )
, m_team_rank( exec.shepherd_worker_rank() )
, m_league_size( team.m_league_size )
, m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
, m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
{
m_exec.shared_reset( m_team_shared );
}
// Continue
operator bool () const { return m_league_rank < m_league_end ; }
// iterate
void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
};
} // namespace Impl
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >
{
private:
const int m_league_size ;
const int m_team_size ;
const int m_shepherd_iter ;
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
typedef Qthread execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::Qthread , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return Qthread::instance().shepherd_worker_size(); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & f )
{ return team_size_max( f ); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & f , const int& )
{ return team_size_max( f ); }
//----------------------------------------
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
// One active team per shepherd
TeamPolicy( Kokkos::Qthread & q
, const int league_size
, const int team_size
)
: m_league_size( league_size )
, m_team_size( team_size < q.shepherd_worker_size()
? team_size : q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{
}
// One active team per shepherd
TeamPolicy( const int league_size
, const int team_size
)
: m_league_size( league_size )
, m_team_size( team_size < Qthread::instance().shepherd_worker_size()
? team_size : Qthread::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
{
}
typedef Impl::QthreadTeamPolicyMember member_type ;
friend class Impl::QthreadTeamPolicyMember ;
};
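/* Illustrative usage sketch. The functor type, do_work(), and the league /
 * team sizes below are hypothetical placeholders, not part of this header:
 *
 * struct MyTeamFunctor {
 * KOKKOS_INLINE_FUNCTION
 * void operator()( const Kokkos::TeamPolicy< Kokkos::Qthread >::member_type & member ) const
 * { do_work( member.league_rank() , member.team_rank() ); }
 * };
 *
 * Kokkos::TeamPolicy< Kokkos::Qthread > policy( league_size , team_size );
 * Kokkos::parallel_for( policy , MyTeamFunctor() );
 */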
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_QTHREADEXEC_HPP */


@ -0,0 +1,643 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
#define KOKKOS_QTHREAD_PARALLEL_HPP
#include <vector>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
}
}
// Function is called once by every concurrent thread.
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() ) );
// All threads wait for completion.
exec.exec_all_barrier();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy
)
: m_func( functor )
, m_policy( policy )
{
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
}
}
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
driver( self.m_func
, ValueInit::init( self.m_func , exec.exec_all_reduce_value() )
, typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() )
);
exec.template exec_all_reduce<FunctorType, typename Policy::work_tag >( self.m_func );
}
public:
template< class HostViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const HostViewType & result_view )
: m_func( functor )
, m_policy( policy )
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
if ( result_view.ptr_on_device() ) {
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
}
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
const FunctorType m_func ;
const Policy m_team ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( TagType() , member ); }
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
typename Policy::member_type member( exec , self.m_team );
while ( member ) {
self.ParallelFor::template driver< typename Policy::work_tag >( member );
member.team_barrier();
member.next_team();
}
exec.exec_all_barrier();
}
public:
ParallelFor( const FunctorType & functor ,
const Policy & policy )
: m_func( functor )
, m_team( policy )
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ 0
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_team ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( member , update ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( TagType() , member , update ); }
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
typename Policy::member_type member( exec , self.m_team );
while ( member ) {
self.ParallelReduce::template driver< typename Policy::work_tag >( member , update );
member.team_barrier();
member.next_team();
}
exec.template exec_all_reduce< FunctorType , typename Policy::work_tag >( self.m_func );
}
public:
template< class ViewType >
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const ViewType & result )
: m_func( functor )
, m_team( policy )
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( functor )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , final );
}
}
static void execute( QthreadExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
const typename Policy::WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
driver( self.m_func , update , false , range );
exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_func );
driver( self.m_func , update , true , range );
exec.exec_all_barrier();
}
public:
ParallelScan( const FunctorType & functor
, const Policy & policy
)
: m_func( functor )
, m_policy( policy )
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this );
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
TeamThreadRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread
, const iType & begin
, const iType & end
)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
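/* Hypothetical usage inside a team functor ('member', 'N', and the view 'v'
 * are placeholders, and a C++11 lambda is assumed):
 *
 * Kokkos::parallel_for( Kokkos::TeamThreadRange( member , N ) ,
 * [&]( const int i ) { v(i) = 2 * v(i); } );
 */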
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
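/* Hypothetical usage: a team-wide sum ('member', 'N', and the view 'v' are
 * placeholders). Every member of the team receives the same 'team_sum'
 * through the final team_reduce:
 *
 * double team_sum = 0 ;
 * Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , N ) ,
 * [&]( const int i , double & val ) { val += v(i); } , team_sum );
 */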
#if defined( KOKKOS_HAVE_CXX11 )
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
} // namespace Kokkos
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
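/* Hypothetical usage: an exclusive prefix sum from 'in' into 'out' (both
 * placeholder views). The contribution of entry 'i' is added after any use
 * of 'val', and is added whether or not 'final' is true:
 *
 * Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , N ) ,
 * [&]( const int i , int & val , const bool final ) {
 * if ( final ) { out(i) = val ; } // val == sum of in(0..i-1)
 * val += in(i);
 * } );
 */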
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
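/* Hypothetical sketch: compute once per team and broadcast the result
 * ('member' and compute_token() are placeholders):
 *
 * int token = 0 ;
 * Kokkos::single( Kokkos::PerTeam( member ) ,
 * [&]( int & val ) { val = compute_token(); } , token );
 * // Afterwards every member of the team holds the same 'token'.
 */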
} // namespace Kokkos
#endif /* #ifndef KOKKOS_QTHREAD_PARALLEL_HPP */


@ -0,0 +1,451 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_QTHREAD )
#include <stdio.h>
#include <stdlib.h>
#include <stdexcept>
#include <iostream>
#include <sstream>
#include <string>
#include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Qthread , void , void > Task ;
namespace {
inline
unsigned padded_sizeof_derived( unsigned sizeof_derived )
{
return sizeof_derived +
( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 );
}
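// For example, with sizeof(Task*) == 8 on a typical 64-bit build:
// padded_sizeof_derived(20) == 24 and padded_sizeof_derived(24) == 24,
// so the trailing TaskMember*[] dependence array is always pointer-aligned.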
// int lock_alloc_dealloc = 0 ;
} // namespace
void Task::deallocate( void * ptr )
{
// Counting on 'free' thread safety so lock/unlock not required.
// However, isolate calls here to mitigate future need to introduce lock/unlock.
// lock
// while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
free( ptr );
// unlock
// Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
}
void * Task::allocate( const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity )
{
// Counting on 'malloc' thread safety so lock/unlock not required.
// However, isolate calls here to mitigate future need to introduce lock/unlock.
// lock
// while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
void * const ptr = malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
// unlock
// Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
return ptr ;
}
Task::~TaskMember()
{
}
Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( arg_verify )
, m_apply_single( arg_apply_single )
, m_apply_team( arg_apply_team )
, m_active_count( & arg_active_count )
, m_qfeb(0)
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
{
qthread_empty( & m_qfeb ); // Set to full when complete
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( & Task::verify_type<void> )
, m_apply_single( arg_apply_single )
, m_apply_team( arg_apply_team )
, m_active_count( & arg_active_count )
, m_qfeb(0)
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
{
qthread_empty( & m_qfeb ); // Set to full when complete
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
//----------------------------------------------------------------------------
void Task::throw_error_add_dependence() const
{
std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
<< " state(" << m_state << ")"
<< " dep_size(" << m_dep_size << ")"
<< std::endl ;
throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
}
void Task::throw_error_verify_type()
{
throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
}
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
{
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
static const char msg_error_count[] = ": negative reference count" ;
static const char msg_error_complete[] = ": destroy task that is not complete" ;
static const char msg_error_dependences[] = ": destroy task that has dependences" ;
static const char msg_error_exception[] = ": caught internal exception" ;
if ( rhs ) { Kokkos::atomic_fetch_add( & (*rhs).m_ref_count , 1 ); }
Task * const lhs_val = Kokkos::atomic_exchange( lhs , rhs );
if ( lhs_val ) {
const int count = Kokkos::atomic_fetch_add( & (*lhs_val).m_ref_count , -1 );
const char * msg_error = 0 ;
try {
if ( 1 == count ) {
// Reference count at zero, delete it
// Should only be deallocating a completed task
if ( (*lhs_val).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
// A completed task should not have dependences...
for ( int i = 0 ; i < (*lhs_val).m_dep_size && 0 == msg_error ; ++i ) {
if ( (*lhs_val).m_dep[i] ) msg_error = msg_error_dependences ;
}
}
else {
msg_error = msg_error_complete ;
}
if ( 0 == msg_error ) {
// Get deletion function and apply it
const Task::function_dealloc_type d = (*lhs_val).m_dealloc ;
(*d)( lhs_val );
}
}
else if ( count <= 0 ) {
msg_error = msg_error_count ;
}
}
catch( ... ) {
if ( 0 == msg_error ) msg_error = msg_error_exception ;
}
if ( 0 != msg_error ) {
if ( no_throw ) {
std::cerr << msg_error_header << msg_error << std::endl ;
std::cerr.flush();
}
else {
std::string msg(msg_error_header);
msg.append(msg_error);
throw std::runtime_error( msg );
}
}
}
}
#endif
//----------------------------------------------------------------------------
aligned_t Task::qthread_func( void * arg )
{
Task * const task = reinterpret_cast< Task * >(arg);
// First member of the team change state to executing.
// Use compare-exchange to avoid race condition with a respawn.
Kokkos::atomic_compare_exchange_strong( & task->m_state
, int(Kokkos::Experimental::TASK_STATE_WAITING)
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
);
// It is a single thread's responsibility to close out
// this task's execution.
bool close_out = false ;
if ( task->m_apply_team ) {
Kokkos::Impl::QthreadTeamPolicyMember member ;
(*task->m_apply_team)( task , member );
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
, member.team_rank()
, member.team_size()
);
fflush(stdout);
member.team_barrier();
close_out = member.team_rank() == 0 ;
}
else {
(*task->m_apply_single)( task );
close_out = true ;
}
if ( close_out ) {
// When dependent tasks run there would be a race
// condition between destroying this task and
// querying the active count pointer from this task.
int volatile * active_count = task->m_active_count ;
if ( task->m_state == ( Kokkos::Experimental::TASK_STATE_WAITING | Kokkos::Experimental::TASK_STATE_EXECUTING ) ) {
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx respawn\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
);
fflush(stdout);
#endif
// Task respawned, set state to waiting and reschedule the task
task->m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
task->schedule();
}
else {
// Task did not respawn, is complete
task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
// Release dependences before allowing dependent tasks to run.
// Otherwise there is a thread race condition for removing dependences.
for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
assign( & task->m_dep[i] , 0 );
}
// Set qthread FEB to full so that dependent tasks are allowed to execute.
// This 'task' may be deleted immediately following this function call.
qthread_fill( & task->m_qfeb );
}
// Decrement active task count before returning.
Kokkos::atomic_decrement( active_count );
}
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx return\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
);
fflush(stdout);
#endif
return 0 ;
}
void Task::respawn()
{
// Change state from pure executing to ( waiting | executing )
// to avoid confusion with simply waiting.
Kokkos::atomic_compare_exchange_strong( & m_state
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
, int(Kokkos::Experimental::TASK_STATE_WAITING |
Kokkos::Experimental::TASK_STATE_EXECUTING)
);
}
void Task::schedule()
{
// Is waiting for execution
// Increment active task count before spawning.
Kokkos::atomic_increment( m_active_count );
// Spawn through qthread. Must malloc the precondition array and hand it to
// qthread, which will eventually free the allocation, so no memory is leaked.
// (Open question: does the malloc call need to be guarded for thread safety?)
aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
for ( int i = 0 ; i < m_dep_size ; ++i ) {
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
}
if ( m_apply_single ) {
qthread_spawn( & Task::qthread_func /* function */
, this /* function argument */
, 0
, NULL
, m_dep_size , qprecon /* dependences */
, NO_SHEPHERD
, QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
);
}
else {
// If more than one shepherd exists, spawn on a shepherd other than this one
const int num_shepherd = qthread_num_shepherds();
const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
const int this_shepherd = qthread_shep();
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(this)
, spawn_shepherd
, num_worker_per_shepherd - 1
);
fflush(stdout);
qthread_spawn_cloneable
( & Task::qthread_func
, this
, 0
, NULL
, m_dep_size , qprecon /* dependences */
, spawn_shepherd
// , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
, unsigned( QTHREAD_SPAWN_LOCAL_PRIORITY )
, num_worker_per_shepherd - 1
);
}
}
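/* Layout of the 'qprecon' array built above, e.g. with m_dep_size == 2 :
 * qprecon[0] == (aligned_t *) 2 -- the dependence count encoded as a pointer
 * qprecon[1] == & m_dep[0]->m_qfeb -- FEB filled when dependence 0 completes
 * qprecon[2] == & m_dep[1]->m_qfeb -- FEB filled when dependence 1 completes
 */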
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Qthread >::member_type &
TaskPolicy< Kokkos::Qthread >::member_single()
{
static member_type s ;
return s ;
}
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
{
volatile int * const active_task_count = & policy.m_active_count ;
while ( *active_task_count ) qthread_yield();
}
} // namespace Experimental
} // namespace Kokkos
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */


@ -0,0 +1,646 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP
#define KOKKOS_QTHREAD_TASKPOLICY_HPP
#include <string>
#include <typeinfo>
#include <stdexcept>
//----------------------------------------------------------------------------
// Defines to enable experimental Qthread functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread.h>
#undef QTHREAD_LOCAL_PRIORITY
#undef CLONED_TASKS
//----------------------------------------------------------------------------
#include <Kokkos_Qthread.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_View.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
class TaskMember< Kokkos::Qthread , void , void >
{
public:
typedef void (* function_apply_single_type) ( TaskMember * );
typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
private:
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_apply_single_type m_apply_single ; ///< Apply function
const function_apply_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
TaskMember() /* = delete */ ;
TaskMember( const TaskMember & ) /* = delete */ ;
TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
static aligned_t qthread_func( void * arg );
static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
static void deallocate( void * );
void throw_error_add_dependence() const ;
static void throw_error_verify_type();
template < class DerivedTaskType >
static
void deallocate( TaskMember * t )
{
DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
ptr->~DerivedTaskType();
deallocate( (void *) ptr );
}
void schedule();
protected :
~TaskMember();
// Used by TaskMember< Qthread , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Qthread , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
public:
template< typename ResultType >
KOKKOS_FUNCTION static
TaskMember * verify_type( TaskMember * t )
{
enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
if ( check_type && t != 0 ) {
// Verify that t->m_verify is this function
const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
if ( t->m_verify != self ) {
t = 0 ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
throw_error_verify_type();
#endif
}
}
return t ;
}
//----------------------------------------
/* Inheritance Requirements on task types:
* typedef FunctorType::value_type value_type ;
* class DerivedTaskType
* : public TaskMember< Qthread , value_type , FunctorType >
* { ... };
* class TaskMember< Qthread , value_type , FunctorType >
* : public TaskMember< Qthread , value_type , void >
* , public Functor
* { ... };
* If value_type != void
* class TaskMember< Qthread , value_type , void >
* : public TaskMember< Qthread , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
*/
/** \brief Allocate and construct a single-thread task */
template< class DerivedTaskType >
static
TaskMember * create_single( const typename DerivedTaskType::functor_type & arg_functor
, volatile int & arg_active_count
, const unsigned arg_dependence_capacity )
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, & TaskMember::template apply_single< functor_type , value_type >
, 0
, arg_active_count
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
/** \brief Allocate and construct a team-thread task */
template< class DerivedTaskType >
static
TaskMember * create_team( const typename DerivedTaskType::functor_type & arg_functor
, volatile int & arg_active_count
, const unsigned arg_dependence_capacity )
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, 0
, & TaskMember::template apply_team< functor_type , value_type >
, arg_active_count
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
void respawn();
void spawn()
{
m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
schedule();
}
//----------------------------------------
typedef FutureValueTypeIsVoidError get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return get_result_type() ; }
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
//----------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
#else
KOKKOS_INLINE_FUNCTION static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
#endif
KOKKOS_INLINE_FUNCTION
TaskMember * get_dependence( int i ) const
{ return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
KOKKOS_INLINE_FUNCTION
int get_dependence() const
{ return m_dep_size ; }
KOKKOS_INLINE_FUNCTION
void clear_dependence()
{
for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
m_dep_size = 0 ;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskMember * before )
{
if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
Kokkos::Experimental::TASK_STATE_EXECUTING == m_state ) &&
m_dep_size < m_dep_capacity ) {
assign( m_dep + m_dep_size , before );
++m_dep_size ;
}
else {
throw_error_add_dependence();
}
}
//----------------------------------------
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
}
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
}
//----------------------------------------
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member , m.m_result );
}
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member );
}
};
//----------------------------------------------------------------------------
/** \brief Base class for tasks with a result value in the Qthread execution space.
*
* The FunctorType must be void because this class is accessed by the
* Future class for the task and result value.
*
* Must be derived from TaskMember<S,void,void> 'root class' so the Future class
* can correctly static_cast from the 'root class' to this class.
*/
template < class ResultType >
class TaskMember< Kokkos::Qthread , ResultType , void >
: public TaskMember< Kokkos::Qthread , void , void >
{
public:
ResultType m_result ;
typedef const ResultType & get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return m_result ; }
protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: task_root_type( & task_root_type::template verify_type< ResultType >
, arg_dealloc
, arg_apply_single
, arg_apply_team
, arg_active_count
, arg_sizeof_derived
, arg_dependence_capacity )
, m_result()
{}
};
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
: public TaskMember< Kokkos::Qthread , ResultType , void >
, public FunctorType
{
public:
typedef FunctorType functor_type ;
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single
, const function_apply_team_type arg_apply_team
, volatile int & arg_active_count
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, arg_apply_single
, arg_apply_team
, arg_active_count
, arg_sizeof_derived
, arg_dependence_capacity )
, functor_type( arg_functor )
{}
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
void wait( TaskPolicy< Kokkos::Qthread > & );
template<>
class TaskPolicy< Kokkos::Qthread >
{
public:
typedef Kokkos::Qthread execution_space ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
private:
typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
TaskPolicy & operator = ( const TaskPolicy & ) /* = delete */ ;
template< class FunctorType >
static inline
const task_root_type * get_task_root( const FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
}
template< class FunctorType >
static inline
task_root_type * get_task_root( FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
}
const unsigned m_default_dependence_capacity ;
volatile int m_active_count_root ;
volatile int & m_active_count ;
public:
KOKKOS_INLINE_FUNCTION
TaskPolicy()
: m_default_dependence_capacity(4)
, m_active_count_root(0)
, m_active_count( m_active_count_root )
{}
KOKKOS_INLINE_FUNCTION
explicit
TaskPolicy( const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_active_count_root(0)
, m_active_count( m_active_count_root )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs )
: m_default_dependence_capacity( rhs.m_default_dependence_capacity )
, m_active_count_root(0)
, m_active_count( rhs.m_active_count )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs
, const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_active_count_root(0)
, m_active_count( rhs.m_active_count )
{}
//----------------------------------------
template< class ValueType >
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
f.m_task->spawn();
#endif
return f ;
}
// Create single-thread task
template< class FunctorType >
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create_single< task_type >
( functor
, m_active_count
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
// Create thread-team task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_team( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create_team< task_type >
( functor
, m_active_count
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
// Add dependence
template< class A1 , class A2 , class A3 , class A4 >
void add_dependence( const Future<A1,A2> & after
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
&&
Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
after.m_task->add_dependence( before.m_task );
#endif
}
//----------------------------------------
// Functions for an executing task functor to query dependences,
// set new dependences, and respawn itself.
template< class FunctorType >
Future< void , execution_space >
get_dependence( const FunctorType * task_functor , int i ) const
{
return Future<void,execution_space>(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->get_dependence(i)
#endif
);
}
template< class FunctorType >
int get_dependence( const FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return get_task_root(task_functor)->get_dependence(); }
#else
{ return 0 ; }
#endif
template< class FunctorType >
void clear_dependence( FunctorType * task_functor ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->clear_dependence();
#endif
}
template< class FunctorType , class A3 , class A4 >
void add_dependence( FunctorType * task_functor
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->add_dependence( before.m_task );
#endif
}
template< class FunctorType >
void respawn( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->respawn(); }
#else
{}
#endif
static member_type & member_single();
friend void wait( TaskPolicy< Kokkos::Qthread > & );
};
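// Usage sketch ( hedged ; 'MyFunctor' is a hypothetical functor with the
// required 'value_type' typedef , not part of this header ) :
//
//   TaskPolicy< Kokkos::Qthread > policy ;
//   Future< MyFunctor::value_type , Kokkos::Qthread > f =
//     policy.spawn( policy.create( MyFunctor() ) );
//   Kokkos::Experimental::wait( policy ); // block until all tasks complete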
} /* namespace Experimental */
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */

@@ -0,0 +1,28 @@
# This Qthreads back-end uses an experimental branch of the Qthreads repository with special #define options.
# Cloning repository and branch:
git clone https://github.com/stelleg/qthreads qthreads-with-clone
cd qthreads-with-clone
# Added to .git/config
#
# [branch "cloned_tasks"]
# remote = origin
# merge = refs/heads/cloned_tasks
#
git branch cloned_tasks
git checkout cloned_tasks
git pull
sh autogen.sh
# Configure with 'hwloc' installation:
./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
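# Build and install (standard autotools flow; assumes configure succeeded):
make
make install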

@@ -0,0 +1,758 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD )
#include <stdint.h>
#include <limits>
#include <utility>
#include <iostream>
#include <sstream>
#include <Kokkos_Threads.hpp>
#include <Kokkos_hwloc.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
ThreadsExec s_threads_process ;
ThreadsExec * s_threads_exec[ ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
pthread_t s_threads_pid[ ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ];
int s_thread_pool_size[3] = { 0 , 0 , 0 };
unsigned s_current_reduce_size = 0 ;
unsigned s_current_shared_size = 0 ;
void (* volatile s_current_function)( ThreadsExec & , const void * );
const void * volatile s_current_function_arg = 0 ;
struct Sentinel {
Sentinel()
{
HostSpace::register_in_parallel( ThreadsExec::in_parallel );
}
~Sentinel()
{
if ( s_thread_pool_size[0] ||
s_thread_pool_size[1] ||
s_thread_pool_size[2] ||
s_current_reduce_size ||
s_current_shared_size ||
s_current_function ||
s_current_function_arg ||
s_threads_exec[0] ) {
std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ;
}
}
};
inline
unsigned fan_size( const unsigned rank , const unsigned size )
{
const unsigned rank_rev = size - ( rank + 1 );
unsigned count = 0 ;
for ( unsigned n = 1 ; ( rank_rev + n < size ) && ! ( rank_rev & n ) ; n <<= 1 ) { ++count ; }
return count ;
}
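// Worked example (illustrative): for a pool of size 8 the root thread
// (rank 7, rank_rev 0) has fan_size 3 == log2(8); rank 5 (rank_rev 2) has
// fan_size 1; any thread with odd rank_rev has fan_size 0.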
} // namespace
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void execute_function_noop( ThreadsExec & , const void * ) {}
void ThreadsExec::driver(void)
{
ThreadsExec this_thread ;
while ( ThreadsExec::Active == this_thread.m_pool_state ) {
(*s_current_function)( this_thread , s_current_function_arg );
// Deactivate thread and wait for reactivation
this_thread.m_pool_state = ThreadsExec::Inactive ;
wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
}
}
ThreadsExec::ThreadsExec()
: m_pool_base(0)
, m_scratch()
, m_scratch_reduce_end(0)
, m_scratch_thread_end(0)
, m_numa_rank(0)
, m_numa_core_rank(0)
, m_pool_rank(0)
, m_pool_size(0)
, m_pool_fan_size(0)
, m_pool_state( ThreadsExec::Terminating )
{
if ( & s_threads_process != this ) {
// A spawned thread
ThreadsExec * const nil = 0 ;
// Which entry in 's_threads_exec', possibly determined from hwloc binding
const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
? ((size_t)s_current_function_arg)
: size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));
// Given a good entry set this thread in the 's_threads_exec' array
if ( entry < s_thread_pool_size[0] &&
nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {
const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
m_numa_rank = coord.first ;
m_numa_core_rank = coord.second ;
m_pool_base = s_threads_exec ;
m_pool_rank = s_thread_pool_size[0] - ( entry + 1 );
m_pool_size = s_thread_pool_size[0] ;
m_pool_fan_size = fan_size( m_pool_rank , m_pool_size );
m_pool_state = ThreadsExec::Active ;
s_threads_pid[ m_pool_rank ] = pthread_self();
// Inform spawning process that the threads_exec entry has been set.
s_threads_process.m_pool_state = ThreadsExec::Active ;
}
else {
// Inform spawning process that the threads_exec entry could not be set.
s_threads_process.m_pool_state = ThreadsExec::Terminating ;
}
}
else {
// Enables 'parallel_for' to execute on an uninitialized Threads device
m_pool_rank = 0 ;
m_pool_size = 1 ;
m_pool_state = ThreadsExec::Inactive ;
s_threads_pid[ m_pool_rank ] = pthread_self();
}
}
ThreadsExec::~ThreadsExec()
{
const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
m_pool_base = 0 ;
m_scratch.clear();
m_scratch_reduce_end = 0 ;
m_scratch_thread_end = 0 ;
m_numa_rank = 0 ;
m_numa_core_rank = 0 ;
m_pool_rank = 0 ;
m_pool_size = 0 ;
m_pool_fan_size = 0 ;
m_pool_state = ThreadsExec::Terminating ;
if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
ThreadsExec * const nil = 0 ;
atomic_compare_exchange( s_threads_exec + entry , this , nil );
s_threads_process.m_pool_state = ThreadsExec::Terminating ;
}
}
int ThreadsExec::get_thread_count()
{
return s_thread_pool_size[0] ;
}
ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank )
{
ThreadsExec * const th =
init_thread_rank < s_thread_pool_size[0]
? s_threads_exec[ s_thread_pool_size[0] - ( init_thread_rank + 1 ) ] : 0 ;
if ( 0 == th || th->m_pool_rank != init_thread_rank ) {
std::ostringstream msg ;
msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : "
<< "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ;
if ( 0 == th ) {
msg << " does not exist" ;
}
else {
msg << " has wrong thread_rank " << th->m_pool_rank ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
return th ;
}
//----------------------------------------------------------------------------
void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
{
ThreadsExec::global_lock();
ThreadsExec::global_unlock();
const int n = exec.m_pool_fan_size ;
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
}
}
}
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void ThreadsExec::verify_is_process( const std::string & name , const bool initialized )
{
if ( ! is_process() ) {
std::string msg( name );
msg.append( " FAILED : Called by a worker thread, can only be called by the master process." );
Kokkos::Impl::throw_runtime_exception( msg );
}
if ( initialized && 0 == s_thread_pool_size[0] ) {
std::string msg( name );
msg.append( " FAILED : Threads not initialized." );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
int ThreadsExec::in_parallel()
{
// A thread function is in execution and
// the function argument is not the special threads process argument and
// the master process is a worker, or the caller is not the master process.
return s_current_function &&
( & s_threads_process != s_current_function_arg ) &&
( s_threads_process.m_pool_base || ! is_process() );
}
// Wait for root thread to become inactive
void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
s_current_function_arg = 0 ;
}
/** \brief Begin execution of the asynchronous functor */
void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
{
verify_is_process("ThreadsExec::start" , true );
if ( s_current_function || s_current_function_arg ) {
Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
}
s_current_function = func ;
s_current_function_arg = arg ;
// Activate threads:
for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
}
if ( s_threads_process.m_pool_size ) {
// Master process is the root thread, run it:
(*func)( s_threads_process , arg );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
}
//----------------------------------------------------------------------------
bool ThreadsExec::sleep()
{
verify_is_process("ThreadsExec::sleep", true );
if ( & execute_sleep == s_current_function ) return false ;
fence();
ThreadsExec::global_lock();
s_current_function = & execute_sleep ;
// Activate threads:
for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) {
s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ;
}
return true ;
}
bool ThreadsExec::wake()
{
verify_is_process("ThreadsExec::wake", true );
if ( & execute_sleep != s_current_function ) return false ;
ThreadsExec::global_unlock();
if ( s_threads_process.m_pool_base ) {
execute_sleep( s_threads_process , 0 );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
fence();
return true ;
}
//----------------------------------------------------------------------------
void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
{
s_current_function = func ;
s_current_function_arg = & s_threads_process ;
const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
ThreadsExec & th = * s_threads_exec[ --i ];
th.m_pool_state = ThreadsExec::Active ;
wait_yield( th.m_pool_state , ThreadsExec::Active );
}
if ( s_threads_process.m_pool_base ) {
s_threads_process.m_pool_state = ThreadsExec::Active ;
(*func)( s_threads_process , 0 );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
s_current_function_arg = 0 ;
s_current_function = 0 ;
}
//----------------------------------------------------------------------------
void * ThreadsExec::root_reduce_scratch()
{
return s_threads_process.reduce_memory();
}
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
{
exec.m_scratch.clear();
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
if ( s_threads_process.m_scratch_thread_end ) {
exec.m_scratch =
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
// touch on this thread
while ( ptr < end ) *ptr++ = 0 ;
}
}
void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
{
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
fence();
const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ;
const size_t old_thread_size = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ;
reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
// Increase size or deallocate completely.
if ( ( old_reduce_size < reduce_size ) ||
( old_thread_size < thread_size ) ||
( ( reduce_size == 0 && thread_size == 0 ) &&
( old_reduce_size != 0 || old_thread_size != 0 ) ) ) {
verify_is_process( "ThreadsExec::resize_scratch" , true );
s_threads_process.m_scratch_reduce_end = reduce_size ;
s_threads_process.m_scratch_thread_end = reduce_size + thread_size ;
execute_serial( & execute_resize_scratch );
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
}
return s_threads_process.m_scratch.alloc_ptr() ;
}
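// Usage sketch (illustrative): request 256 bytes of reduction scratch and
// 1024 bytes of per-thread scratch; both sizes are rounded up to
// Kokkos::Impl::MEMORY_ALIGNMENT before any reallocation:
//
//   void * scratch = ThreadsExec::resize_scratch( 256 , 1024 );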
//----------------------------------------------------------------------------
void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
{
verify_is_process("ThreadsExec::print_configuration",false);
fence();
const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
// Forestall compiler warnings for unused variables.
(void) numa_count;
(void) cores_per_numa;
(void) threads_per_core;
s << "Kokkos::Threads" ;
#if defined( KOKKOS_HAVE_PTHREAD )
s << " KOKKOS_HAVE_PTHREAD" ;
#endif
#if defined( KOKKOS_HAVE_HWLOC )
s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
#endif
if ( s_thread_pool_size[0] ) {
s << " threads[" << s_thread_pool_size[0] << "]"
<< " threads_per_numa[" << s_thread_pool_size[1] << "]"
<< " threads_per_core[" << s_thread_pool_size[2] << "]"
;
if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; }
s << " ReduceScratch[" << s_current_reduce_size << "]"
<< " SharedScratch[" << s_current_shared_size << "]" ;
s << std::endl ;
if ( detail ) {
for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) {
ThreadsExec * const th = s_threads_exec[i] ;
if ( th ) {
const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 );
s << " Thread[ " << th->m_pool_rank << " : "
<< th->m_numa_rank << "." << th->m_numa_core_rank << " ]" ;
s << " Fan{" ;
for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) {
ThreadsExec * const thfan = th->m_pool_base[rank_rev+(1<<j)] ;
s << " [ " << thfan->m_pool_rank << " : "
<< thfan->m_numa_rank << "." << thfan->m_numa_core_rank << " ]" ;
}
s << " }" ;
if ( th == & s_threads_process ) {
s << " is_process" ;
}
}
s << std::endl ;
}
}
}
else {
s << " not initialized" << std::endl ;
}
}
//----------------------------------------------------------------------------
int ThreadsExec::is_initialized()
{ return 0 != s_threads_exec[0] ; }
void ThreadsExec::initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa ,
bool allow_asynchronous_threadpool )
{
static const Sentinel sentinel ;
const bool is_initialized = 0 != s_thread_pool_size[0] ;
unsigned thread_spawn_failed = 0 ;
for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
s_threads_exec[i] = NULL;
if ( ! is_initialized ) {
// If thread_count, use_numa_count, or use_cores_per_numa are zero
// then they will be given default values based upon hwloc detection
// and allowed asynchronous execution.
const bool hwloc_avail = hwloc::available();
if ( thread_count == 0 ) {
thread_count = hwloc_avail
? Kokkos::hwloc::get_available_numa_count() *
Kokkos::hwloc::get_available_cores_per_numa() *
Kokkos::hwloc::get_available_threads_per_core()
: 1 ;
}
const unsigned thread_spawn_begin =
hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
allow_asynchronous_threadpool ,
thread_count ,
use_numa_count ,
use_cores_per_numa ,
s_threads_coord );
const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;
if ( thread_spawn_begin ) {
// Synchronous with s_threads_coord[0] as the process core
// Claim entry #0 for binding the process core.
s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
}
s_thread_pool_size[0] = thread_count ;
s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
s_current_function = & execute_function_noop ; // Initialization work function
for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
// If hwloc available then spawned thread will
// choose its own entry in 's_threads_coord'
// otherwise specify the entry.
s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );
// Spawn thread executing the 'driver()' function.
// Wait until spawned thread has attempted to initialize.
// If spawning and initialization are successful then
// an entry in 's_threads_exec' will be assigned.
if ( ThreadsExec::spawn() ) {
wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
}
if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
}
// Wait for all spawned threads to deactivate before zeroing the function.
for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
// Try to protect against cache coherency failure by casting to volatile.
ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
if ( th ) {
wait_yield( th->m_pool_state , ThreadsExec::Active );
}
else {
++thread_spawn_failed ;
}
}
s_current_function = 0 ;
s_current_function_arg = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
if ( ! thread_spawn_failed ) {
// Bind process to the core on which it was located before spawning occurred
Kokkos::hwloc::bind_this_thread( proc_coord );
if ( thread_spawn_begin ) { // Include process in pool.
const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
s_threads_exec[0] = & s_threads_process ;
s_threads_process.m_numa_rank = coord.first ;
s_threads_process.m_numa_core_rank = coord.second ;
s_threads_process.m_pool_base = s_threads_exec ;
s_threads_process.m_pool_rank = thread_count - 1 ; // Reversed for scan-compatible reductions
s_threads_process.m_pool_size = thread_count ;
s_threads_process.m_pool_fan_size = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
}
else {
s_threads_process.m_pool_base = 0 ;
s_threads_process.m_pool_rank = 0 ;
s_threads_process.m_pool_size = 0 ;
s_threads_process.m_pool_fan_size = 0 ;
}
// Initial allocations:
ThreadsExec::resize_scratch( 1024 , 1024 );
}
else {
s_thread_pool_size[0] = 0 ;
s_thread_pool_size[1] = 0 ;
s_thread_pool_size[2] = 0 ;
}
}
if ( is_initialized || thread_spawn_failed ) {
std::ostringstream msg ;
msg << "Kokkos::Threads::initialize ERROR" ;
if ( is_initialized ) {
msg << " : already initialized" ;
}
if ( thread_spawn_failed ) {
msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
}
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
// Initialize the lock array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
}
//----------------------------------------------------------------------------
void ThreadsExec::finalize()
{
verify_is_process("ThreadsExec::finalize",false);
fence();
resize_scratch(0,0);
const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {
if ( s_threads_exec[i] ) {
s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;
wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
s_threads_pid[i] = 0 ;
}
if ( s_threads_process.m_pool_base ) {
( & s_threads_process )->~ThreadsExec();
s_threads_exec[0] = 0 ;
}
Kokkos::hwloc::unbind_this_thread();
s_thread_pool_size[0] = 0 ;
s_thread_pool_size[1] = 0 ;
s_thread_pool_size[2] = 0 ;
// Reset master thread to run solo.
s_threads_process.m_numa_rank = 0 ;
s_threads_process.m_numa_core_rank = 0 ;
s_threads_process.m_pool_base = 0 ;
s_threads_process.m_pool_rank = 0 ;
s_threads_process.m_pool_size = 1 ;
s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
}
//----------------------------------------------------------------------------
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
Threads & Threads::instance(int)
{
static Threads t ;
return t ;
}
int Threads::thread_pool_size( int depth )
{
return Impl::s_thread_pool_size[depth];
}
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
int Threads::thread_pool_rank()
{
const pthread_t pid = pthread_self();
int i = 0;
while ( ( i < Impl::s_thread_pool_size[0] ) && ( pid != Impl::s_threads_pid[i] ) ) { ++i ; }
return i ;
}
#endif
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) */

@@ -0,0 +1,465 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADSEXEC_HPP
#define KOKKOS_THREADSEXEC_HPP
#include <stdio.h>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class ThreadsExec {
public:
// Fan array has log_2(NT) reduction threads plus 2 scan threads
// Currently limited to 16k threads.
enum { MAX_FAN_COUNT = 16 };
enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) };
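// Worked check: MAX_THREAD_COUNT == 1 << ( 16 - 2 ) == 16384 , the "16k" above.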
enum { VECTOR_LENGTH = 8 };
/** \brief States of a worker thread */
enum { Terminating ///< Termination in progress
, Inactive ///< Exists, waiting for work
, Active ///< Exists, performing work
, Rendezvous ///< Exists, waiting in a barrier or reduce
, ScanCompleted
, ScanAvailable
, ReductionAvailable
};
private:
friend class Kokkos::Threads ;
// Fan-in operations' root is the highest ranking thread
// to place the 'scan' reduction intermediate values on
// the threads that need them.
// For a simple reduction the thread location is arbitrary.
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
Impl::AllocationTracker m_scratch ;
int m_scratch_reduce_end ;
int m_scratch_thread_end ;
int m_numa_rank ;
int m_numa_core_rank ;
int m_pool_rank ;
int m_pool_size ;
int m_pool_fan_size ;
int volatile m_pool_state ; ///< State for global synchronizations
static void global_lock();
static void global_unlock();
static bool spawn();
static void execute_resize_scratch( ThreadsExec & , const void * );
static void execute_sleep( ThreadsExec & , const void * );
ThreadsExec( const ThreadsExec & );
ThreadsExec & operator = ( const ThreadsExec & );
static void execute_serial( void (*)( ThreadsExec & , const void * ) );
public:
KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; }
KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; }
KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank ; }
KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank ; }
static int get_thread_count();
static ThreadsExec * get_thread( const int init_thread_rank );
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }
static void driver(void);
~ThreadsExec();
ThreadsExec();
static void * resize_scratch( size_t reduce_size , size_t thread_size );
static void * root_reduce_scratch();
static bool is_process();
static void verify_is_process( const std::string & , const bool initialized );
static int is_initialized();
static void initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa ,
bool allow_asynchronous_threadpool );
static void finalize();
/* Given a requested team size, return valid team size */
static unsigned team_size_valid( unsigned );
static void print_configuration( std::ostream & , const bool detail = false );
//------------------------------------
static void wait_yield( volatile int & , const int );
//------------------------------------
// All-thread functions:
inline
int all_reduce( const int value )
{
// Make sure there is enough scratch space:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
*((volatile int*) reduce_memory()) = value ;
memory_fence();
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
int accum = 0 ;
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
accum += *((volatile int *) get_thread( rank )->reduce_memory());
}
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
*((volatile int *) get_thread( rank )->reduce_memory()) = accum ;
}
memory_fence();
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
get_thread( rank )->m_pool_state = ThreadsExec::Active ;
}
}
return *((volatile int*) reduce_memory());
}
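// Usage sketch (illustrative): every pool thread contributes 1, so each
// receives the active pool size back:
//
//   const int nthreads = exec.all_reduce( 1 );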
//------------------------------------
// All-thread functions:
template< class FunctorType , class ArgTag >
inline
void fan_in_reduce( const FunctorType & f ) const
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
typedef Kokkos::Impl::FunctorFinal< FunctorType , ArgTag > Final ;
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
if ( ! rev_rank ) {
Final::final( f , reduce_memory() );
}
}
inline
void fan_in() const
{
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
template< class FunctorType , class ArgTag >
inline
void scan_large( const FunctorType & f )
{
// Sequence of states:
// 0) Active : entry and exit state
// 1) ReductionAvailable : reduction value available
// 2) ScanAvailable : inclusive scan value available
// 3) Rendezvous : All threads inclusive scan value are available
// 4) ScanCompleted : exclusive scan value copied
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ;
typedef typename Traits::value_type scalar_type ;
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
const unsigned count = Traits::value_count( f );
scalar_type * const work_value = (scalar_type *) reduce_memory();
//--------------------------------
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
// Copy reduction value to scan value before releasing from this phase.
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; }
if ( rev_rank ) {
// Set: Active -> ReductionAvailable
m_pool_state = ThreadsExec::ReductionAvailable ;
// Wait for contributing threads' scan value to be available.
if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) {
ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ;
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
// This thread has completed inclusive scan
// Set: ReductionAvailable -> ScanAvailable
m_pool_state = ThreadsExec::ScanAvailable ;
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
// All threads have completed the inclusive scan.
// All non-root threads are in the Rendezvous state.
// Threads are free to overwrite their reduction value.
//--------------------------------
if ( ( rev_rank + 1 ) < m_pool_size ) {
// Exclusive scan: copy the previous thread's inclusive scan value
ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread
const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ;
for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; }
}
else {
(void) Init::init( f , work_value );
}
//--------------------------------
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
}
}
template< class FunctorType , class ArgTag >
inline
void scan_small( const FunctorType & f )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > Init ;
typedef typename Traits::value_type scalar_type ;
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
const unsigned count = Traits::value_count( f );
scalar_type * const work_value = (scalar_type *) reduce_memory();
//--------------------------------
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads
scalar_type * ptr_prev = 0 ;
for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory();
if ( rank ) {
for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; }
Join::join( f , ptr + count , ptr );
}
else {
(void) Init::init( f , ptr );
}
ptr_prev = ptr ;
}
}
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
}
}
//------------------------------------
/** \brief Wait for previous asynchronous functor to
* complete and release the Threads device.
* Acquire the Threads device and start this functor.
*/
static void start( void (*)( ThreadsExec & , const void * ) , const void * );
static int in_parallel();
static void fence();
static bool sleep();
static bool wake();
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
inline int Threads::in_parallel()
{ return Impl::ThreadsExec::in_parallel(); }
inline int Threads::is_initialized()
{ return Impl::ThreadsExec::is_initialized(); }
inline void Threads::initialize(
unsigned threads_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa ,
bool allow_asynchronous_threadpool )
{
Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool );
}
inline void Threads::finalize()
{
Impl::ThreadsExec::finalize();
}
inline void Threads::print_configuration( std::ostream & s , const bool detail )
{
Impl::ThreadsExec::print_configuration( s , detail );
}
inline bool Threads::sleep()
{ return Impl::ThreadsExec::sleep() ; }
inline bool Threads::wake()
{ return Impl::ThreadsExec::wake() ; }
inline void Threads::fence()
{ Impl::ThreadsExec::fence() ; }
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_THREADSEXEC_HPP */

@@ -0,0 +1,254 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_HAVE_PTHREAD )
/* Standard 'C' Linux libraries */
#include <pthread.h>
#include <sched.h>
#include <errno.h>
/* Standard C++ libraries */
#include <cstdlib>
#include <string>
#include <iostream>
#include <stdexcept>
#include <Kokkos_Threads.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ;
// Pthreads compatible driver.
// Recovery from an exception would require constant intra-thread health
// verification, which would negatively impact runtime. As such, simply
// abort the process.
void * internal_pthread_driver( void * )
{
try {
ThreadsExec::driver();
}
catch( const std::exception & x ) {
std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl ;
std::cerr.flush();
std::abort();
}
catch( ... ) {
std::cerr << "Exception thrown from worker thread" << std::endl ;
std::cerr.flush();
std::abort();
}
return NULL ;
}
} // namespace
//----------------------------------------------------------------------------
// Spawn a thread
bool ThreadsExec::spawn()
{
bool result = false ;
pthread_attr_t attr ;
// All three attribute calls must succeed ( each returns 0 on success ):
if ( 0 == pthread_attr_init( & attr ) &&
     0 == pthread_attr_setscope( & attr, PTHREAD_SCOPE_SYSTEM ) &&
     0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
pthread_t pt ;
result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
}
pthread_attr_destroy( & attr );
return result ;
}
//----------------------------------------------------------------------------
bool ThreadsExec::is_process()
{
static const pthread_t master_pid = pthread_self();
return pthread_equal( master_pid , pthread_self() );
}
void ThreadsExec::global_lock()
{
pthread_mutex_lock( & host_internal_pthread_mutex );
}
void ThreadsExec::global_unlock()
{
pthread_mutex_unlock( & host_internal_pthread_mutex );
}
//----------------------------------------------------------------------------
void ThreadsExec::wait_yield( volatile int & flag , const int value )
{
while ( value == flag ) { sched_yield(); }
}
} // namespace Impl
} // namespace Kokkos
/* end #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#elif defined( KOKKOS_HAVE_WINTHREAD )
/* Windows libraries */
#include <windows.h>
#include <process.h>
/* Standard C++ libraries */
#include <cstdlib>
#include <string>
#include <iostream>
#include <stdexcept>
#include <Kokkos_Threads.hpp>
//----------------------------------------------------------------------------
// Driver for each created pthread
namespace Kokkos {
namespace Impl {
namespace {
unsigned WINAPI internal_winthread_driver( void * arg )
{
ThreadsExec::driver();
return 0 ;
}
class ThreadLockWindows {
private:
CRITICAL_SECTION m_handle ;
~ThreadLockWindows()
{ DeleteCriticalSection( & m_handle ); }
ThreadLockWindows()
{ InitializeCriticalSection( & m_handle ); }
ThreadLockWindows( const ThreadLockWindows & );
ThreadLockWindows & operator = ( const ThreadLockWindows & );
public:
static ThreadLockWindows & singleton();
void lock()
{ EnterCriticalSection( & m_handle ); }
void unlock()
{ LeaveCriticalSection( & m_handle ); }
};
ThreadLockWindows & ThreadLockWindows::singleton()
{ static ThreadLockWindows self ; return self ; }
} // namespace <>
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Spawn this thread
bool ThreadsExec::spawn()
{
unsigned Win32ThreadID = 0 ;
HANDLE handle = (HANDLE)
  _beginthreadex( 0 , 0 , internal_winthread_driver , 0 , 0 , & Win32ThreadID );
// '_beginthreadex' returns zero on failure; report success:
return 0 != handle ;
}
bool ThreadsExec::is_process() { return true ; }
void ThreadsExec::global_lock()
{ ThreadLockWindows::singleton().lock(); }
void ThreadsExec::global_unlock()
{ ThreadLockWindows::singleton().unlock(); }
void ThreadsExec::wait_yield( volatile int & flag , const int value )
{
  while ( value == flag ) { Sleep(0); }
}
} // namespace Impl
} // namespace Kokkos
#endif /* end #elif defined( KOKKOS_HAVE_WINTHREAD ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

@@ -0,0 +1,730 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADSTEAM_HPP
#define KOKKOS_THREADSTEAM_HPP
#include <stdio.h>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Kokkos_Atomic.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< class > struct ThreadsExecAdapter ;
//----------------------------------------------------------------------------
class ThreadsExecTeamMember {
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef Kokkos::Threads execution_space ;
typedef execution_space::scratch_memory_space space ;
ThreadsExec * const m_exec ;
ThreadsExec * const * m_team_base ; ///< Base for team fan-in
space m_team_shared ;
int m_team_shared_size ;
int m_team_size ;
int m_team_rank ;
int m_team_rank_rev ;
int m_league_size ;
int m_league_end ;
int m_league_rank ;
inline
void set_team_shared()
{ new( & m_team_shared ) space( ((char *) (*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE , m_team_shared_size ); }
public:
// Fan-in and wait until the matching fan-out is called.
// The root thread which does not wait will return true.
// All other threads will return false during the fan-out.
KOKKOS_INLINE_FUNCTION bool team_fan_in() const
{
int n , j ;
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;
}
KOKKOS_INLINE_FUNCTION void team_fan_out() const
{
int n , j ;
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
m_team_base[j]->state() = ThreadsExec::Active ;
}
}
public:
KOKKOS_INLINE_FUNCTION static int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
{
team_fan_in();
team_fan_out();
}
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType& value, const int& thread_id) const
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ }
#else
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
, ValueType , void >::type type ;
if ( m_team_base ) {
type * const local_value = ((type*) m_team_base[0]->scratch_memory());
if(team_rank() == thread_id) *local_value = value;
memory_fence();
team_barrier();
value = *local_value;
}
#endif
}
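// Illustrative sketch, not part of the original source: broadcasting a
// value computed by one team member to the rest of the team. The functor
// body and compute_winner() are hypothetical.
//
//   KOKKOS_INLINE_FUNCTION
//   void operator()( const ThreadsExecTeamMember & member ) const
//     {
//       int chosen = 0 ;
//       if ( member.team_rank() == 0 ) chosen = compute_winner();
//       member.team_broadcast( chosen , 0 ); // every rank now holds rank 0's value
//     }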
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(Type) < TEAM_REDUCE_SIZE , Type , void >::type type ;
if ( 0 == m_exec ) return value ;
*((volatile type*) m_exec->scratch_memory() ) = value ;
memory_fence();
type & accum = *((type *) m_team_base[0]->scratch_memory() );
if ( team_fan_in() ) {
for ( int i = 1 ; i < m_team_size ; ++i ) {
accum += *((type *) m_team_base[i]->scratch_memory() );
}
memory_fence();
}
team_fan_out();
return accum ;
}
#endif
#ifdef KOKKOS_HAVE_CXX11
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
, const JoinOp & op_in ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ValueType(); }
#else
{
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
#endif
#else // KOKKOS_HAVE_CXX11
template< class JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{
typedef typename JoinOp::value_type value_type;
#endif
#endif // KOKKOS_HAVE_CXX11
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
if ( 0 == m_exec ) return value ;
type * const local_value = ((type*) m_exec->scratch_memory());
// Set this thread's contribution
*local_value = value ;
// Fence to make sure the base team member has access:
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
type * const team_value = ((type*) m_team_base[0]->scratch_memory());
// Join to the team value:
for ( int i = 1 ; i < m_team_size ; ++i ) {
op.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
}
// Team base thread may "lap" member threads so copy out to their local value.
for ( int i = 1 ; i < m_team_size ; ++i ) {
*((type*) m_team_base[i]->scratch_memory()) = *team_value ;
}
// Fence to make sure all team members have access
memory_fence();
}
team_fan_out();
// Value was changed by the team base
return *((type volatile const *) local_value);
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ArgType(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
if ( 0 == m_exec ) return type(0);
volatile type * const work_value = ((type*) m_exec->scratch_memory());
*work_value = value ;
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
// m_team_base[0] == highest ranking team member
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
//
// 1) copy from lower to higher rank, initialize lowest rank to zero
// 2) prefix sum from lowest to highest rank, skipping lowest rank
type accum = 0 ;
if ( global_accum ) {
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_team_base[i]->scratch_memory());
accum += val ;
}
accum = atomic_fetch_add( global_accum , accum );
}
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_team_base[i]->scratch_memory());
const type offset = accum ;
accum += val ;
val = offset ;
}
memory_fence();
}
team_fan_out();
return *work_value ;
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const
{ return this-> template team_scan<ArgType>( value , 0 ); }
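// Illustrative sketch, not part of the original source: using the exclusive
// team_scan to turn per-thread counts into output offsets. The 'counts'
// view is hypothetical.
//
//   KOKKOS_INLINE_FUNCTION
//   void operator()( const ThreadsExecTeamMember & member ) const
//     {
//       const int my_count  = counts( member.team_rank() );
//       const int my_offset = member.team_scan( my_count );
//       // my_offset == sum of counts of lower-ranked members; the highest
//       // rank recovers the team total as my_offset + my_count.
//     }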
//----------------------------------------
// Private for the driver
template< class Arg0 , class Arg1 >
ThreadsExecTeamMember( Impl::ThreadsExec * exec
, const TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > & team
, const int shared_size )
: m_exec( exec )
, m_team_base(0)
, m_team_shared(0,0)
, m_team_shared_size( shared_size )
, m_team_size(0)
, m_team_rank(0)
, m_team_rank_rev(0)
, m_league_size(0)
, m_league_end(0)
, m_league_rank(0)
{
if ( team.league_size() ) {
// Execution is using device-team interface:
const int pool_rank_rev = m_exec->pool_size() - ( m_exec->pool_rank() + 1 );
const int team_rank_rev = pool_rank_rev % team.team_alloc();
// The team may be using fewer threads than its allocation (a multiple of
// threads per core); the surplus threads idle.
if ( team_rank_rev < team.team_size() ) {
const size_t pool_league_size = m_exec->pool_size() / team.team_alloc() ;
const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 );
m_team_base = m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev ;
m_team_size = team.team_size() ;
m_team_rank = team.team_size() - ( team_rank_rev + 1 );
m_team_rank_rev = team_rank_rev ;
m_league_size = team.league_size();
m_league_rank = ( team.league_size() * pool_league_rank ) / pool_league_size ;
m_league_end = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ;
set_team_shared();
}
}
}
ThreadsExecTeamMember()
: m_exec(0)
, m_team_base(0)
, m_team_shared(0,0)
, m_team_shared_size(0)
, m_team_size(1)
, m_team_rank(0)
, m_team_rank_rev(0)
, m_league_size(1)
, m_league_end(0)
, m_league_rank(0)
{}
inline
ThreadsExec & threads_exec_team_base() const { return m_team_base ? **m_team_base : *m_exec ; }
bool valid() const
{ return m_league_rank < m_league_end ; }
void next()
{
if ( ++m_league_rank < m_league_end ) {
team_barrier();
set_team_shared();
}
}
void set_league_shmem( const int arg_league_rank
, const int arg_league_size
, const int arg_shmem_size
)
{
m_league_rank = arg_league_rank ;
m_league_size = arg_league_size ;
m_team_shared_size = arg_shmem_size ;
set_team_shared();
}
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< class Arg0 , class Arg1 >
class TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
{
private:
int m_league_size ;
int m_team_size ;
int m_team_alloc ;
inline
void init( const int league_size_request
, const int team_size_request )
{
const int pool_size = execution_space::thread_pool_size(0);
const int team_max = execution_space::thread_pool_size(1);
const int team_grain = execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
m_team_size = team_size_request < team_max ?
team_size_request : team_max ;
// Round team size up to a multiple of 'team_grain'
const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
const int team_count = pool_size / team_size_grain ;
// Constraint : pool_size = m_team_alloc * team_count
m_team_alloc = pool_size / team_count ;
}
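// Worked example with illustrative numbers, not from the original source:
// pool_size = 16, team_max = 8, team_grain = 4, team_size_request = 6.
// Then m_team_size = 6, team_size_grain = 4 * ((6+4-1)/4) = 8,
// team_count = 16 / 8 = 2, and m_team_alloc = 16 / 2 = 8: each team of
// 6 working threads is allocated 8 pool threads, so 2 threads per team idle.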
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicy execution_policy ;
typedef Kokkos::Threads execution_space ;
typedef typename
Impl::if_c< ! Impl::is_same< Kokkos::Threads , Arg0 >::value , Arg0 , Arg1 >::type
work_tag ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return execution_space::thread_pool_size(1); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & )
{ return execution_space::thread_pool_size(2); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType &, const int& )
{ return execution_space::thread_pool_size(2); }
//----------------------------------------
inline int team_size() const { return m_team_size ; }
inline int team_alloc() const { return m_team_alloc ; }
inline int league_size() const { return m_league_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; }
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; }
typedef Impl::ThreadsExecTeamMember member_type ;
friend class Impl::ThreadsExecTeamMember ;
};
} /* namespace Kokkos */
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, const iType& count)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
TeamThreadRange( const Impl::ThreadsExecTeamMember& thread
, const iType & begin
, const iType & end
)
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,begin,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >
ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(const Impl::ThreadsExecTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(const Impl::ThreadsExecTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
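// Illustrative sketch, not part of the original source: distributing rows
// of a hypothetical view 'A' across the threads of a team; 'member' is the
// team handle inside a team functor (C++11 lambda).
//
//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , nrow ) ,
//     [&]( const int row ) { A( row ) = 2 * A( row ); } );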
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
#if defined( KOKKOS_HAVE_CXX11 )
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
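// Illustrative sketch, not part of the original source: a team-level max
// reduction through the JoinType overload above. The 'values' view is
// hypothetical; the init_result of 0 is the neutral element for max over
// non-negative entries.
//
//   int team_max = 0 ;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i , int & val ) { if ( values(i) > val ) val = values(i); } ,
//     [&]( int & dst , const int & src ) { if ( src > dst ) dst = src ; } ,
//     team_max );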
} //namespace Kokkos
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
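// Illustrative sketch, not part of the original source: a per-thread sum
// over a vector range. On this backend the "vector lanes" of a thread run
// serially, so the reduce is a simple (possibly ivdep-vectorized) loop.
// The 'x' view is hypothetical.
//
//   double thread_sum = 0 ;
//   Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( member , m ) ,
//     [&]( const int j , double & val ) { val += x(j); } , thread_sum );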
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
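// Illustrative sketch, not part of the original source: an exclusive prefix
// sum over a thread's vector range. The 'in' and 'out' views are
// hypothetical; the operator adds its contribution whether or not 'final'
// is set, as required above.
//
//   Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , m ) ,
//     [&]( const int j , int & partial , const bool final )
//       {
//         if ( final ) out(j) = partial ; // exclusive prefix value
//         partial += in(j);
//       } );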
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
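// Illustrative sketch, not part of the original source: rank 0 computes a
// value once and the broadcasting overload above hands it to every team
// member. pick_pivot() is hypothetical.
//
//   int pivot = 0 ;
//   Kokkos::single( Kokkos::PerTeam( member ) ,
//     [&]( int & val ) { val = pick_pivot(); } , pivot );
//   // all team members now hold the same 'pivot'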
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_THREADSTEAM_HPP */

View File

@ -0,0 +1,427 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_THREADS_PARALLEL_HPP
#define KOKKOS_THREADS_PARALLEL_HPP
#include <vector>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
}
}
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() ) );
exec.fan_in();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
{
ThreadsExec::start( & ParallelFor::execute , this );
ThreadsExec::fence();
}
};
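// Note, not part of the original source: this specialization is what a
// user-level call such as
//
//   Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Threads >( 0 , N ) , functor );
//
// instantiates. The constructor broadcasts 'execute' to the pool via
// ThreadsExec::start, each pool thread iterates its own WorkRange slice,
// and ThreadsExec::fence() blocks until the fan-in completes.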
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
const FunctorType m_func ;
const Policy m_policy ;
const int m_shared ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( TagType() , member ); }
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
for ( ; member.valid() ; member.next() ) {
self.ParallelFor::template driver< typename Policy::work_tag >( member );
}
exec.fan_in();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
{
ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelFor::execute , this );
ThreadsExec::fence();
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
}
}
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
driver( self.m_func
, ValueInit::init( self.m_func , exec.reduce_memory() )
, typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() )
);
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
}
public:
template< class HostViewType >
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const HostViewType & result_view )
: m_func( functor )
, m_policy( policy )
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , 0 );
ThreadsExec::start( & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
ThreadsExec::fence();
if ( result_view.ptr_on_device() ) {
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
}
}
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
const int m_shared ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( member , update ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( TagType() , member , update ); }
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
for ( ; member.valid() ; member.next() ) {
self.ParallelReduce::template driver< work_tag >( member , update );
}
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
}
public:
ParallelReduce( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::execute , this );
ThreadsExec::fence();
}
template< class ViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const ViewType & result )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
ThreadsExec::fence();
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
}
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( i , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , final );
}
}
static void execute( ThreadsExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
driver( self.m_func , update , false , range );
// exec.template scan_large<FunctorType,work_tag>( self.m_func );
exec.template scan_small<FunctorType,work_tag>( self.m_func );
driver( self.m_func , update , true , range );
exec.fan_in();
}
public:
ParallelScan( const FunctorType & functor , const Policy & policy )
: m_func( functor )
, m_policy( policy )
{
ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_func ) , 0 );
ThreadsExec::start( & ParallelScan::execute , this );
ThreadsExec::fence();
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */

View File

@ -0,0 +1,599 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Threads , void , void > Task ;
namespace {
int volatile s_count_serial = 0 ;
int volatile s_count_team = 0 ;
Task * volatile s_ready_team = 0 ;
Task * volatile s_ready_serial = 0 ;
Task * const s_lock = reinterpret_cast<Task*>( ~((unsigned long)0) );
Task * const s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) - 1 );
} /* namespace */
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Threads >::TaskPolicy
( const unsigned arg_default_dependence_capacity
, const unsigned arg_team_size
)
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_team_size( arg_team_size )
{
const int threads_total = Threads::thread_pool_size(0);
const int threads_per_numa = Threads::thread_pool_size(1);
const int threads_per_core = Threads::thread_pool_size(2);
if ( 0 == arg_team_size ) {
// If a team task then claim for execution until count is zero
// Issue: team collectives cannot assume which pool members are in the team.
// Issue: team must only span a single NUMA region.
// If more than one thread per core then map cores to work team,
// else map numa to work team.
if ( 1 < threads_per_core ) m_team_size = threads_per_core ;
else if ( 1 < threads_per_numa ) m_team_size = threads_per_numa ;
else m_team_size = 1 ;
}
// Verify a valid team size
const bool valid_team_size =
( 0 < m_team_size && m_team_size <= threads_total ) &&
(
( 1 == m_team_size ) ||
( threads_per_core == m_team_size ) ||
( threads_per_numa == m_team_size )
);
if ( ! valid_team_size ) {
std::ostringstream msg ;
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Threads > ERROR"
<< " invalid team_size(" << m_team_size << ")"
<< " threads_per_core(" << threads_per_core << ")"
<< " threads_per_numa(" << threads_per_numa << ")"
<< " threads_total(" << threads_total << ")"
;
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
TaskPolicy< Kokkos::Threads >::member_type &
TaskPolicy< Kokkos::Threads >::member_single()
{
static member_type s ;
return s ;
}
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Threads > & policy )
{
typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
enum { BASE_SHMEM = 1024 };
void * const arg = reinterpret_cast<void*>( long( policy.m_team_size ) );
Kokkos::Impl::ThreadsExec::resize_scratch( 0 , member_type::team_reduce_size() + BASE_SHMEM );
Kokkos::Impl::ThreadsExec::start( & Impl::Task::execute_ready_tasks_driver , arg );
Kokkos::Impl::ThreadsExec::fence();
}
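// Illustrative sketch, not part of the original source: building a policy
// for this backend and draining its ready queues. Task creation and
// spawning go through the generic Kokkos_TaskPolicy.hpp interface, which
// is assumed here rather than shown in this file.
//
//   Kokkos::Experimental::TaskPolicy< Kokkos::Threads >
//     policy( 4 /* default dependence capacity */ , 0 /* team size: auto */ );
//   // ... create and spawn tasks through the generic interface ...
//   Kokkos::Experimental::wait( policy ); // returns when both ready queues are empty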
} /* namespace Experimental */
} /* namespace Kokkos */
namespace Kokkos {
namespace Experimental {
namespace Impl {
//----------------------------------------------------------------------------
void Task::throw_error_verify_type()
{
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::verify_type ERROR");
}
void Task::deallocate( void * ptr )
{
free( ptr );
}
void * Task::allocate( const unsigned n )
{
void * const ptr = malloc(n);
return ptr ;
}
Task::~TaskMember()
{
}
//----------------------------------------------------------------------------
void Task::reschedule()
{
// Reschedule transitions from executing back to waiting.
const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_EXECUTING) , int(TASK_STATE_WAITING) );
if ( old_state != int(TASK_STATE_EXECUTING) ) {
fprintf( stderr
, "reschedule ERROR task[%lx] state(%d)\n"
, (unsigned long) this
, old_state
);
fflush(stderr);
}
}
void Task::schedule()
{
//----------------------------------------
// State is either constructing or already waiting.
// If constructing then transition to waiting.
{
const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_CONSTRUCTING) , int(TASK_STATE_WAITING) );
Task * const waitTask = *((Task * volatile const *) & m_wait );
Task * const next = *((Task * volatile const *) & m_next );
if ( s_denied == waitTask || 0 != next ||
( old_state != int(TASK_STATE_CONSTRUCTING) &&
old_state != int(TASK_STATE_WAITING) ) ) {
fprintf(stderr,"Task::schedule task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n"
, (unsigned long) this
, old_state
, (unsigned long) waitTask
, (unsigned long) next );
fflush(stderr);
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::Task spawn or respawn state error");
}
}
//----------------------------------------
// Insert this task into another dependence that is not complete
// Push on to the wait queue, fails if ( s_denied == m_dep[i]->m_wait )
bool insert_in_ready_queue = true ;
for ( int i = 0 ; i < m_dep_size && insert_in_ready_queue ; ) {
Task * const task_dep = m_dep[i] ;
Task * const head_value_old = *((Task * volatile *) & task_dep->m_wait );
if ( s_denied == head_value_old ) {
// Wait queue is closed, try again with the next queue
++i ;
}
else {
// Wait queue is open and not locked.
// If CAS succeeds then have acquired the lock.
// Have exclusive access to this task.
// Assign m_next assuming a successful insertion into the queue.
// Fence the memory assignment before attempting the CAS.
*((Task * volatile *) & m_next ) = head_value_old ;
memory_fence();
// Attempt to insert this task into the queue
Task * const wait_queue_head = atomic_compare_exchange( & task_dep->m_wait , head_value_old , this );
if ( head_value_old == wait_queue_head ) {
insert_in_ready_queue = false ;
}
}
}
//----------------------------------------
// All dependences are complete, insert into the ready list
if ( insert_in_ready_queue ) {
// Increment the count of ready tasks.
// Count is decremented when task is complete.
Task * volatile * queue = 0 ;
if ( m_serial ) {
atomic_increment( & s_count_serial );
queue = & s_ready_serial ;
}
else {
atomic_increment( & s_count_team );
queue = & s_ready_team ;
}
while ( insert_in_ready_queue ) {
Task * const head_value_old = *queue ;
if ( s_lock != head_value_old ) {
// The head of the ready queue is only accessed via CAS; s_lock marks it busy.
// Have exclusive access to this task: point m_next at the observed head,
// assuming a successful insert.
// Fence the assignment before attempting the insert.
*((Task * volatile *) & m_next ) = head_value_old ;
memory_fence();
Task * const ready_queue_head = atomic_compare_exchange( queue , head_value_old , this );
if ( head_value_old == ready_queue_head ) {
// Successful insert
insert_in_ready_queue = false ; // done
}
}
}
}
}
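// Distilled sketch, not part of the original source, of the lock-free push
// protocol used above: read the head, publish it through m_next, then one
// compare-and-swap installs this task as the new head; on failure re-read
// and retry. (The ready queues additionally skip the s_lock sentinel while
// a pop holds the queue.)
//
//   Task * head = *queue ;
//   do {
//     m_next = head ;
//     memory_fence();
//     head = atomic_compare_exchange( queue , m_next , this );
//   } while ( head != m_next );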
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs_ptr , Task * rhs )
{
// Increment rhs reference count.
if ( rhs ) { atomic_increment( & rhs->m_ref_count ); }
// Assign the pointer and retrieve the previous value.
Task * const old_lhs = atomic_exchange( lhs_ptr , rhs );
if ( old_lhs ) {
// Decrement former lhs reference count.
// If reference count is zero task must be complete, then delete task.
// Task is ready for deletion when wait == s_denied
int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ;
// if 'count != 0' then 'old_lhs' may be deallocated before dereferencing
Task * const wait = count == 0 ? *((Task * const volatile *) & old_lhs->m_wait ) : (Task*) 0 ;
if ( count < 0 || ( count == 0 && wait != s_denied ) ) {
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Threads>::assign ERROR deleting" ;
fprintf( stderr , "%s task(0x%lx) m_ref_count(%d) , m_wait(0x%ld)\n"
, msg_error_header
, (unsigned long) old_lhs
, count
, (unsigned long) wait );
fflush(stderr);
Kokkos::Impl::throw_runtime_exception( msg_error_header );
}
if ( count == 0 ) {
// When 'count == 0' this thread has exclusive access to 'old_lhs'
const Task::function_dealloc_type d = old_lhs->m_dealloc ;
(*d)( old_lhs );
}
}
}
#endif
//----------------------------------------------------------------------------
Task * Task::get_dependence( int i ) const
{
Task * const t = m_dep[i] ;
if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || i < 0 || m_dep_size <= i || 0 == t ) {
fprintf( stderr
, "TaskMember< Threads >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n"
, (unsigned long) this
, m_state
, m_dep_size
, i
, (unsigned long) t
);
fflush( stderr );
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::get_dependence ERROR");
}
return t ;
}
//----------------------------------------------------------------------------
void Task::add_dependence( Task * before )
{
if ( before != 0 ) {
int const state = *((volatile const int *) & m_state );
// Can add dependence during construction or during execution
if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == state ||
Kokkos::Experimental::TASK_STATE_EXECUTING == state ) &&
m_dep_size < m_dep_capacity ) {
++m_dep_size ;
assign( m_dep + (m_dep_size-1) , before );
memory_fence();
}
else {
fprintf( stderr
, "TaskMember< Threads >::add_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) m_dep_capacity(%d) }\n"
, (unsigned long) this
, m_state
, m_dep_size
, m_dep_capacity
);
fflush( stderr );
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::add_dependence ERROR");
}
}
}
//----------------------------------------------------------------------------
void Task::clear_dependence()
{
for ( int i = m_dep_size - 1 ; 0 <= i ; --i ) {
assign( m_dep + i , 0 );
}
*((volatile int *) & m_dep_size ) = 0 ;
memory_fence();
}
//----------------------------------------------------------------------------
Task * Task::pop_ready_task( Task * volatile * const queue )
{
Task * const task_old = *queue ;
if ( s_lock != task_old && 0 != task_old ) {
Task * const task = atomic_compare_exchange( queue , task_old , s_lock );
if ( task_old == task ) {
// May have acquired the lock and task.
// One or more other threads may have acquired this same task and lock
// due to a respawning ABA race condition.
// Can only be sure of acquire with a successful state transition from waiting to executing
const int old_state = atomic_compare_exchange( & task->m_state, int(TASK_STATE_WAITING), int(TASK_STATE_EXECUTING) );
if ( old_state == int(TASK_STATE_WAITING) ) {
// Transitioned this task from waiting to executing
// Update the queue to the next entry and release the lock
Task * const next_old = *((Task * volatile *) & task->m_next );
Task * const s = atomic_compare_exchange( queue , s_lock , next_old );
if ( s != s_lock ) {
fprintf(stderr,"Task::pop_ready_task( 0x%lx ) UNLOCK ERROR\n", (unsigned long) queue );
fflush(stderr);
}
*((Task * volatile *) & task->m_next ) = 0 ;
return task ;
}
else {
fprintf(stderr,"Task::pop_ready_task( 0x%lx ) task(0x%lx) state(%d) ERROR\n"
, (unsigned long) queue
, (unsigned long) task
, old_state );
fflush(stderr);
}
}
}
return (Task *) 0 ;
}
void Task::complete_executed_task( Task * task , volatile int * const queue_count )
{
// State is either executing or if respawned then waiting,
// try to transition from executing to complete.
// Reads the current value.
const int state_old =
atomic_compare_exchange( & task->m_state
, int(Kokkos::Experimental::TASK_STATE_EXECUTING)
, int(Kokkos::Experimental::TASK_STATE_COMPLETE) );
if ( Kokkos::Experimental::TASK_STATE_WAITING == state_old ) {
task->schedule(); /* Task requested a respawn so reschedule it */
}
else if ( Kokkos::Experimental::TASK_STATE_EXECUTING != state_old ) {
fprintf( stderr
, "TaskMember< Threads >::execute_serial completion ERROR : task[%lx]{ state_old(%d) dep_size(%d) }\n"
, (unsigned long) & task
, state_old
, task->m_dep_size
);
fflush( stderr );
}
else {
// Clear dependences of this task before locking wait queue
task->clear_dependence();
// Stop other tasks from adding themselves to this task's wait queue.
// The wait queue is updated concurrently so guard with an atomic.
// Setting the wait queue to denied denotes delete-ability of the task by any thread.
// Therefore, once 'denied' the task pointer must be treated as invalid.
Task * wait_queue = *((Task * volatile *) & task->m_wait );
Task * wait_queue_old = 0 ;
do {
wait_queue_old = wait_queue ;
wait_queue = atomic_compare_exchange( & task->m_wait , wait_queue_old , s_denied );
} while ( wait_queue_old != wait_queue );
task = 0 ;
// Pop waiting tasks and schedule them
while ( wait_queue ) {
Task * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
x->schedule();
}
}
atomic_decrement( queue_count );
}
//----------------------------------------------------------------------------
void Task::execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & exec , const void * arg )
{
typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
// Whole pool is calling this function
// Create the thread team member with shared memory for the given task.
const int team_size = reinterpret_cast<long>( arg );
member_type member( & exec , TeamPolicy< Kokkos::Threads >( 1 , team_size ) , 0 );
Kokkos::Impl::ThreadsExec & exec_team_base = member.threads_exec_team_base();
Task * volatile * const task_team_ptr = reinterpret_cast<Task**>( exec_team_base.reduce_memory() );
if ( member.team_fan_in() ) {
*task_team_ptr = 0 ;
Kokkos::memory_fence();
}
member.team_fan_out();
long int iteration_count = 0 ;
// Each team must iterate this loop synchronously to ensure team-execution of a team task
while ( 0 < s_count_serial || 0 < s_count_team ) {
if ( member.team_rank() == 0 ) {
// Only one team member attempts to pop a team task
*task_team_ptr = pop_ready_task( & s_ready_team );
}
// Query if team acquired a team task
Task * const task_team = *task_team_ptr ;
if ( task_team ) {
// Set shared memory
member.set_league_shmem( 0 , 1 , task_team->m_shmem_size );
(*task_team->m_team)( task_team , member );
// Do not proceed until all members have completed the task,
// the task has been completed or rescheduled, and
// the team task pointer has been cleared.
if ( member.team_fan_in() ) {
complete_executed_task( task_team , & s_count_team );
*task_team_ptr = 0 ;
Kokkos::memory_fence();
}
member.team_fan_out();
}
else {
Task * const task_serial = pop_ready_task( & s_ready_serial );
if ( task_serial ) {
if ( task_serial->m_serial ) (*task_serial->m_serial)( task_serial );
complete_executed_task( task_serial , & s_count_serial );
}
}
++iteration_count ;
}
exec.fan_in();
}
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */

View File

@ -0,0 +1,584 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_THREADS_TASKPOLICY_HPP
#define KOKKOS_THREADS_TASKPOLICY_HPP
#include <Kokkos_Threads.hpp>
#include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD )
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Base class for all Kokkos::Threads tasks */
template<>
class TaskMember< Kokkos::Threads , void , void > {
public:
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::ThreadsExecTeamMember & );
private:
// Needed to disambiguate references to base class variables
// without triggering a false-positive on Intel compiler warning #955.
typedef TaskMember< Kokkos::Threads , void , void > SelfType ;
function_dealloc_type m_dealloc ; ///< Deallocation
function_verify_type m_verify ; ///< Result type verification
function_team_type m_team ; ///< Apply function
function_single_type m_serial ; ///< Apply function
TaskMember ** m_dep ; ///< Dependences
TaskMember * m_wait ; ///< Linked list of tasks waiting on this task
TaskMember * m_next ; ///< Linked list of tasks waiting on a different task
int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_shmem_size ;
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
// 7 pointers + 5 integers
#if defined( KOKKOS_HAVE_CXX11 )
TaskMember( const TaskMember & ) = delete ;
TaskMember & operator = ( const TaskMember & ) = delete ;
#else
TaskMember( const TaskMember & );
TaskMember & operator = ( const TaskMember & );
#endif
static void * allocate( const unsigned arg_size );
static void deallocate( void * );
template< class DerivedTaskType >
static
void deallocate( TaskMember * t )
{
DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
ptr->~DerivedTaskType();
deallocate( (void*) ptr );
}
static TaskMember * pop_ready_task( TaskMember * volatile * const queue );
static void complete_executed_task( TaskMember * , volatile int * const );
static void throw_error_verify_type();
protected:
TaskMember()
: m_dealloc(0)
, m_verify(0)
, m_team(0)
, m_serial(0)
, m_dep(0)
, m_wait(0)
, m_next(0)
, m_dep_capacity(0)
, m_dep_size(0)
, m_shmem_size(0)
, m_ref_count(0)
, m_state(0)
{}
public:
static void execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & , const void * );
~TaskMember();
template< typename ResultType >
KOKKOS_FUNCTION static
TaskMember * verify_type( TaskMember * t )
{
enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
if ( check_type && t != 0 ) {
// Verify that t->m_verify is this function
const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
if ( t->m_verify != self ) {
t = 0 ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
throw_error_verify_type();
#endif
}
}
return t ;
}
//----------------------------------------
/* Inheritance Requirements on task types:
*
* class DerivedTaskType
* : public TaskMember< Threads , DerivedType::value_type , FunctorType >
* { ... };
*
* class TaskMember< Threads , DerivedType::value_type , FunctorType >
* : public TaskMember< Threads , DerivedType::value_type , void >
* , public Functor
* { ... };
*
* If value_type != void
* class TaskMember< Threads , value_type , void >
* : public TaskMember< Threads , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
*/
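// Illustrative sketch (not part of this header) of the layering described
// above, assuming a hypothetical functor 'MyFunctor' with value_type double:
//
//   struct MyFunctor {
//     typedef double value_type ;
//     void apply( double & result ) const { result = 42.0 ; }
//   };
//
// TaskMember< Threads , double , MyFunctor > then derives from
// TaskMember< Threads , double , void > (which holds m_result) and from
// MyFunctor itself, so a single allocation carries the scheduling links, the
// result storage, the functor state, and the trailing
// TaskMember*[ dependence_capacity ] array.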
//----------------------------------------
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value
, TaskMember * >::type t )
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename DerivedTaskType::result_type result_type ;
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
Kokkos::Impl::FunctorApply< functor_type , Tag , result_type & >
::apply( (functor_type &) self , & self.m_result );
}
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value
, TaskMember * >::type t )
{
typedef typename DerivedTaskType::functor_type functor_type ;
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
Kokkos::Impl::FunctorApply< functor_type , Tag , void >::apply( (functor_type &) self );
}
//----------------------------------------
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if<(
Kokkos::Impl::is_same<Tag,void>::value
&&
Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value
), TaskMember * >::type t
, Kokkos::Impl::ThreadsExecTeamMember & member
)
{
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
self.DerivedTaskType::functor_type::apply( member );
}
/** \brief Apply a team task whose functor assigns a non-void result */
template< class DerivedTaskType , class Tag >
KOKKOS_FUNCTION static
void apply_team( typename Kokkos::Impl::enable_if<(
Kokkos::Impl::is_same<Tag,void>::value
&&
! Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value
), TaskMember * >::type t
, Kokkos::Impl::ThreadsExecTeamMember & member
)
{
DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
self.DerivedTaskType::functor_type::apply( member , self.m_result );
}
//----------------------------------------
/** \brief Allocate and construct a task */
template< class DerivedTaskType , class Tag >
static
TaskMember * create( const typename DerivedTaskType::functor_type & arg_functor
, const function_team_type arg_apply_team
, const function_single_type arg_apply_single
, const unsigned arg_team_shmem
, const unsigned arg_dependence_capacity
)
{
enum { padding_size = sizeof(DerivedTaskType) % sizeof(TaskMember*)
? sizeof(TaskMember*) - sizeof(DerivedTaskType) % sizeof(TaskMember*) : 0 };
enum { derived_size = sizeof(DerivedTaskType) + padding_size };
DerivedTaskType * const task =
new( allocate( derived_size + sizeof(TaskMember*) * arg_dependence_capacity ) )
DerivedTaskType( arg_functor );
task->SelfType::m_dealloc = & TaskMember::template deallocate< DerivedTaskType > ;
task->SelfType::m_verify = & TaskMember::template verify_type< typename DerivedTaskType::value_type > ;
task->SelfType::m_team = arg_apply_team ;
task->SelfType::m_serial = arg_apply_single ;
task->SelfType::m_dep = (TaskMember**)( ((unsigned char *)task) + derived_size );
task->SelfType::m_dep_capacity = arg_dependence_capacity ;
task->SelfType::m_shmem_size = arg_team_shmem ;
task->SelfType::m_state = TASK_STATE_CONSTRUCTING ;
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) task->SelfType::m_dep[i] = 0 ;
return static_cast< TaskMember * >( task );
}
void reschedule();
void schedule();
//----------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static
void assign( TaskMember ** const lhs , TaskMember * const rhs );
#else
KOKKOS_INLINE_FUNCTION static
void assign( TaskMember ** const lhs , TaskMember * const rhs ) {}
#endif
TaskMember * get_dependence( int i ) const ;
KOKKOS_INLINE_FUNCTION
int get_dependence() const
{ return m_dep_size ; }
void clear_dependence();
void add_dependence( TaskMember * before );
//----------------------------------------
typedef FutureValueTypeIsVoidError get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return get_result_type() ; }
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
};
/** \brief A Future< Kokkos::Threads , ResultType > will cast
* from TaskMember< Kokkos::Threads , void , void >
* to TaskMember< Kokkos::Threads , ResultType , void >
* to query the result.
*/
template< class ResultType >
class TaskMember< Kokkos::Threads , ResultType , void >
: public TaskMember< Kokkos::Threads , void , void >
{
public:
typedef ResultType result_type ;
result_type m_result ;
typedef const result_type & get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return m_result ; }
inline
TaskMember() : TaskMember< Kokkos::Threads , void , void >(), m_result() {}
#if defined( KOKKOS_HAVE_CXX11 )
TaskMember( const TaskMember & ) = delete ;
TaskMember & operator = ( const TaskMember & ) = delete ;
#else
private:
TaskMember( const TaskMember & );
TaskMember & operator = ( const TaskMember & );
#endif
};
/** \brief Callback functions will cast
* from TaskMember< Kokkos::Threads , void , void >
* to TaskMember< Kokkos::Threads , ResultType , FunctorType >
* to execute work functions.
*/
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Threads , ResultType , FunctorType >
: public TaskMember< Kokkos::Threads , ResultType , void >
, public FunctorType
{
public:
typedef ResultType result_type ;
typedef FunctorType functor_type ;
inline
TaskMember( const functor_type & arg_functor )
: TaskMember< Kokkos::Threads , ResultType , void >()
, functor_type( arg_functor )
{}
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
void wait( TaskPolicy< Kokkos::Threads > & );
template<>
class TaskPolicy< Kokkos::Threads >
{
public:
typedef Kokkos::Threads execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
private:
typedef Impl::TaskMember< Kokkos::Threads , void , void > task_root_type ;
int m_default_dependence_capacity ;
int m_team_size ; ///< Fixed size of a task-team
template< class FunctorType >
static inline
const task_root_type * get_task_root( const FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
}
template< class FunctorType >
static inline
task_root_type * get_task_root( FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
}
public:
// Valid team sizes are 1,
// Threads::pool_size(1) == threads per numa, or
// Threads::pool_size(2) == threads per core
TaskPolicy( const unsigned arg_default_dependence_capacity = 4
, const unsigned arg_team_size = 0 /* default from thread pool topology */
);
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs )
: m_default_dependence_capacity( rhs.m_default_dependence_capacity )
, m_team_size( rhs.m_team_size )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs
, const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_team_size( rhs.m_team_size )
{}
TaskPolicy & operator = ( const TaskPolicy &rhs ) {
m_default_dependence_capacity = rhs.m_default_dependence_capacity;
m_team_size = rhs.m_team_size;
return *this;
}
// Create serial-thread task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type , void >
( functor
, task_root_type::function_team_type(0)
, & task_root_type::template apply_single< task_type , void >
, 0
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
// Create thread-team task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_team( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type , void >
( functor
, & task_root_type::template apply_team< task_type , void >
, task_root_type::function_single_type(0)
, Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value( functor , m_team_size )
, ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
)
#endif
);
}
template< class A1 , class A2 , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( const Future<A1,A2> & after
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
&&
Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
after.m_task->add_dependence( before.m_task );
#endif
}
template< class FunctorType , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( FunctorType * task_functor
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->add_dependence( before.m_task ); }
#else
{}
#endif
template< class ValueType >
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
f.m_task->schedule();
#endif
return f ;
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void respawn( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->reschedule(); }
#else
{}
#endif
//----------------------------------------
// Functions for an executing task functor to query dependences,
// set new dependences, and respawn itself.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< void , execution_space >
get_dependence( const FunctorType * task_functor , int i ) const
{
return Future<void,execution_space>(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->get_dependence(i)
#endif
);
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
int get_dependence( const FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return get_task_root(task_functor)->get_dependence(); }
#else
{ return 0 ; }
#endif
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void clear_dependence( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->clear_dependence(); }
#else
{}
#endif
//----------------------------------------
static member_type & member_single();
friend void wait( TaskPolicy< Kokkos::Threads > & );
};
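// A minimal usage sketch (illustrative only; 'MyTask' is a hypothetical
// functor providing the nested value_type and apply() members this policy
// expects):
//
//   TaskPolicy< Kokkos::Threads > policy ;
//   Future< double , Kokkos::Threads > f =
//     policy.spawn( policy.create( MyTask() ) );
//   Kokkos::Experimental::wait( policy );  // drain the ready queues
//   const double answer = f.get();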
} /* namespace Experimental */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */
@ -0,0 +1,275 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
namespace Kokkos {
namespace Experimental {
namespace Impl {
bool
SharedAllocationRecord< void , void >::
is_sane( SharedAllocationRecord< void , void > * arg_record )
{
constexpr static SharedAllocationRecord * zero = 0 ;
SharedAllocationRecord * const root = arg_record ? arg_record->m_root : 0 ;
bool ok = root != 0 && root->m_count == 0 ;
if ( ok ) {
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == 0 );
for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) {
const bool ok_non_null = rec && rec->m_prev && ( rec == root || rec->m_next );
const bool ok_root = ok_non_null && rec->m_root == root ;
const bool ok_prev_next = ok_non_null && ( rec->m_prev != root ? rec->m_prev->m_next == rec : root_next == rec );
const bool ok_next_prev = ok_non_null && rec->m_next->m_prev == rec ;
const bool ok_count = ok_non_null && 0 <= rec->m_count ;
ok = ok_root && ok_prev_next && ok_next_prev && ok_count ;
if ( ! ok ) {
fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n"
, reinterpret_cast< unsigned long >( rec )
, rec->m_count
, reinterpret_cast< unsigned long >( rec->m_root )
, reinterpret_cast< unsigned long >( rec->m_next )
, reinterpret_cast< unsigned long >( rec->m_prev )
, reinterpret_cast< unsigned long >( rec->m_next->m_prev )
, reinterpret_cast< unsigned long >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
);
}
}
if ( zero != Kokkos::atomic_exchange( & root->m_next , root_next ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane unlocking");
}
}
return ok ;
}
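// The list-locking idiom used above and throughout this file, as a standalone
// sketch (illustrative): the record set is "locked" by atomically swapping the
// root's next pointer to zero, so any competing insert/erase spins until the
// saved pointer is swapped back in.
//
//   SharedAllocationRecord * const null_rec = 0 ;
//   SharedAllocationRecord * saved = 0 ;
//   while ( ( saved = Kokkos::atomic_exchange( & root->m_next , null_rec ) ) == 0 ); // lock
//   /* ... traverse or mutate the list ... */
//   Kokkos::atomic_exchange( & root->m_next , saved ); // unlock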
SharedAllocationRecord<void,void> *
SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * const arg_root , void * const arg_data_ptr )
{
constexpr static SharedAllocationRecord * zero = 0 ;
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , zero ) ) == 0 );
// Iterate searching for the record with this data pointer
SharedAllocationRecord * r = root_next ;
while ( ( r != arg_root ) && ( r->data() != arg_data_ptr ) ) { r = r->m_next ; }
if ( r == arg_root ) { r = 0 ; }
if ( zero != Kokkos::atomic_exchange( & arg_root->m_next , root_next ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
}
return r ;
}
/**\brief Construct and insert into 'arg_root' tracking set.
* use_count is zero.
*/
SharedAllocationRecord< void , void >::
SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root
, SharedAllocationHeader * arg_alloc_ptr
, size_t arg_alloc_size
, SharedAllocationRecord< void , void >::function_type arg_dealloc
)
: m_alloc_ptr( arg_alloc_ptr )
, m_alloc_size( arg_alloc_size )
, m_dealloc( arg_dealloc )
, m_root( arg_root )
, m_prev( 0 )
, m_next( 0 )
, m_count( 0 )
{
constexpr static SharedAllocationRecord * zero = 0 ;
// Insert into the root double-linked list for tracking
//
// before: arg_root->m_next == next ; next->m_prev == arg_root
// after: arg_root->m_next == this ; this->m_prev == arg_root ;
// this->m_next == next ; next->m_prev == this
m_prev = m_root ;
// Read root->m_next and lock by setting to zero
while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == 0 );
m_next->m_prev = this ;
if ( zero != Kokkos::atomic_exchange( & m_root->m_next , this ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
}
}
void
SharedAllocationRecord< void , void >::
increment( SharedAllocationRecord< void , void > * arg_record )
{
const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , 1 );
if ( old_count < 0 ) { // Error
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed increment");
}
}
SharedAllocationRecord< void , void > *
SharedAllocationRecord< void , void >::
decrement( SharedAllocationRecord< void , void > * arg_record )
{
constexpr static SharedAllocationRecord * zero = 0 ;
const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , -1 );
if ( old_count == 1 ) {
// before: arg_record->m_prev->m_next == arg_record &&
// arg_record->m_next->m_prev == arg_record
//
// after: arg_record->m_prev->m_next == arg_record->m_next &&
// arg_record->m_next->m_prev == arg_record->m_prev
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , zero ) ) == 0 );
arg_record->m_next->m_prev = arg_record->m_prev ;
if ( root_next != arg_record ) {
arg_record->m_prev->m_next = arg_record->m_next ;
}
else {
// before: arg_record->m_root == arg_record->m_prev
// after: arg_record->m_root == arg_record->m_next
root_next = arg_record->m_next ;
}
// Unlock the list:
if ( zero != Kokkos::atomic_exchange( & arg_record->m_root->m_next , root_next ) ) {
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement unlocking");
}
arg_record->m_next = 0 ;
arg_record->m_prev = 0 ;
function_type d = arg_record->m_dealloc ;
(*d)( arg_record );
arg_record = 0 ;
}
else if ( old_count < 1 ) { // Error
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement count");
}
return arg_record ;
}
void
SharedAllocationRecord< void , void >::
print_host_accessible_records( std::ostream & s
, const char * const space_name
, const SharedAllocationRecord * const root
, const bool detail )
{
const SharedAllocationRecord< void , void > * r = root ;
char buffer[256] ;
if ( detail ) {
do {
snprintf( buffer , 256 , "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
, space_name
, reinterpret_cast<unsigned long>( r )
, reinterpret_cast<unsigned long>( r->m_prev )
, reinterpret_cast<unsigned long>( r->m_next )
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<unsigned long>( r->m_dealloc )
, r->m_alloc_ptr->m_label
);
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
else {
do {
if ( r->m_alloc_ptr ) {
snprintf( buffer , 256 , "%s [ 0x%.12lx + %ld ] %s\n"
, space_name
, reinterpret_cast< unsigned long >( r->data() )
, r->size()
, r->m_alloc_ptr->m_label
);
}
else {
snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
}
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
}
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
@ -0,0 +1,287 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
namespace Kokkos {
namespace Experimental {
namespace Impl {
template< class MemorySpace = void , class DestroyFunctor = void >
class SharedAllocationRecord ;
class SharedAllocationHeader {
private:
typedef SharedAllocationRecord<void,void> Record ;
static constexpr unsigned maximum_label_length = ( 1u << 7 /* 128 */ ) - sizeof(Record*);
template< class , class > friend class SharedAllocationRecord ;
Record * m_record ;
char m_label[ maximum_label_length ];
public:
/* Given user memory, get a pointer to the header */
KOKKOS_INLINE_FUNCTION static
const SharedAllocationHeader * get_header( void * alloc_ptr )
{ return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
};
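// Layout sketch: an allocation is [ SharedAllocationHeader | user data ], so a
// pointer handed to the user maps back to its header by stepping over the
// 128-byte header (illustrative; 'user_ptr' is assumed to come from a record):
//
//   const SharedAllocationHeader * h = SharedAllocationHeader::get_header( user_ptr );
//   // the header stores the label and a pointer back to the tracking record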
template<>
class SharedAllocationRecord< void , void > {
protected:
static_assert( sizeof(SharedAllocationHeader) == ( 1u << 7 /* 128 */ ) , "sizeof(SharedAllocationHeader) != 128" );
template< class , class > friend class SharedAllocationRecord ;
typedef void (* function_type )( SharedAllocationRecord<void,void> * );
SharedAllocationHeader * const m_alloc_ptr ;
size_t const m_alloc_size ;
function_type const m_dealloc ;
SharedAllocationRecord * const m_root ;
SharedAllocationRecord * m_prev ;
SharedAllocationRecord * m_next ;
int m_count ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
/**\brief Construct and insert into 'arg_root' tracking set.
* use_count is zero.
*/
SharedAllocationRecord( SharedAllocationRecord * arg_root
, SharedAllocationHeader * arg_alloc_ptr
, size_t arg_alloc_size
, function_type arg_dealloc
);
public:
~SharedAllocationRecord() = default ;
constexpr SharedAllocationRecord()
: m_alloc_ptr( 0 )
, m_alloc_size( 0 )
, m_dealloc( 0 )
, m_root( this )
, m_prev( this )
, m_next( this )
, m_count( 0 )
{}
static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length ;
KOKKOS_INLINE_FUNCTION
const SharedAllocationHeader * head() const { return m_alloc_ptr ; }
/* User's memory begins at the end of the header */
KOKKOS_INLINE_FUNCTION
void * data() const { return reinterpret_cast<void*>( m_alloc_ptr + 1 ); }
/* Size of the user's memory, excluding the header */
constexpr size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader) ; }
/* Not 'constexpr' because 'm_count' is modified concurrently */
int use_count() const { return m_count ; }
/* Increment use count */
static void increment( SharedAllocationRecord * );
/* Decrement use count. If 1->0 then remove from the tracking list and invoke m_dealloc */
static SharedAllocationRecord * decrement( SharedAllocationRecord * );
/* Given a root record and data pointer find the record */
static SharedAllocationRecord * find( SharedAllocationRecord * const , void * const );
/* Sanity check for the whole set of records to which the input record belongs.
* Locks the set's insert/erase operations until the sanity check is complete.
*/
static bool is_sane( SharedAllocationRecord * );
/* Print host-accessible records */
static void print_host_accessible_records( std::ostream &
, const char * const space_name
, const SharedAllocationRecord * const root
, const bool detail );
};
/*
* Memory space specialization of SharedAllocationRecord< Space , void > requires :
*
* SharedAllocationRecord< Space , void > : public SharedAllocationRecord< void , void >
* {
* // delete allocated user memory via static_cast to this type.
* static void deallocate( const SharedAllocationRecord<void,void> * );
* Space m_space ;
* }
*/
template< class MemorySpace , class DestroyFunctor >
class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void >
{
private:
static void deallocate( SharedAllocationRecord<void,void> * record_ptr )
{ delete static_cast<SharedAllocationRecord<MemorySpace,DestroyFunctor>*>(record_ptr); }
SharedAllocationRecord( const MemorySpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc
)
/* Allocate user memory as [ SharedAllocationHeader , user_memory ] */
: SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & deallocate )
, m_destroy()
{}
~SharedAllocationRecord() { m_destroy.destroy_shared_allocation(); }
public:
DestroyFunctor m_destroy ;
// Allocate with a zero use count. Incrementing the use count from zero to one
// inserts the record into the tracking list. Decrementing the count from one to zero
// removes from the tracking list and deallocates.
KOKKOS_INLINE_FUNCTION static
SharedAllocationRecord * allocate( const MemorySpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc );
#else
return (SharedAllocationRecord *) 0 ;
#endif
}
};
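// Sketch of the DestroyFunctor hook (illustrative; 'SomeSpace' and 'Finalize'
// are hypothetical): the functor is default-constructed with the record,
// populated before first use, and its destroy_shared_allocation() runs when
// the use count returns to zero.
//
//   struct Finalize {
//     void destroy_shared_allocation() { /* flush or finalize the user data */ }
//   };
//   SharedAllocationRecord< SomeSpace , Finalize > * rec =
//     SharedAllocationRecord< SomeSpace , Finalize >::allocate( space , "label" , bytes );
//   rec->m_destroy = Finalize();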
union SharedAllocationTracker {
private:
typedef SharedAllocationRecord<void,void> Record ;
enum : unsigned long {
DO_NOT_DEREF_FLAG = 0x01ul
};
// The allocation record resides in Host memory space
Record * m_record ;
unsigned long m_record_bits;
// Set the DO_NOT_DEREF flag while preserving the pointer bits so that
// increment/decrement will skip this record.
KOKKOS_INLINE_FUNCTION
static Record * disable( Record * rec )
{ return reinterpret_cast<Record*>( reinterpret_cast<unsigned long>( rec ) | DO_NOT_DEREF_FLAG ); }
KOKKOS_INLINE_FUNCTION
void increment() const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
#endif
}
KOKKOS_INLINE_FUNCTION
void decrement() const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
#endif
}
public:
KOKKOS_INLINE_FUNCTION
constexpr SharedAllocationTracker() : m_record_bits( DO_NOT_DEREF_FLAG ) {}
template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > & get_record() const
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
template< class MemorySpace >
std::string get_label() const
{ return static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record )->get_label(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( Record * arg_record )
: m_record( arg_record ) { increment(); }
KOKKOS_INLINE_FUNCTION
~SharedAllocationTracker() { decrement(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( const SharedAllocationTracker & rhs )
: m_record( rhs.m_record ) { increment(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( SharedAllocationTracker && rhs )
: m_record( rhs.m_record ) { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
{
decrement();
m_record = rhs.m_record ;
increment();
return *this ;
}
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs )
{
decrement(); // release the record currently held, matching copy assignment
m_record = rhs.m_record ;
rhs.m_record_bits = DO_NOT_DEREF_FLAG ;
return *this ;
}
};
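// Reference-counting semantics sketch (illustrative; 'some_record' is a
// hypothetical Record*): copy construction and copy assignment adjust the use
// count, move operations transfer it, and a tracker whose bits carry
// DO_NOT_DEREF_FLAG never touches the record.
//
//   SharedAllocationTracker a( some_record );    // use count +1
//   SharedAllocationTracker b( a );              // copy: +1
//   SharedAllocationTracker c( std::move(b) );   // move: count unchanged, b disarmed
//   // destruction of 'a' and 'c' each decrement; 'b' does nothing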
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
@ -0,0 +1,416 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP
#define KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
struct WithoutInitializing_t {};
struct AllowPadding_t {};
template< class ... Parameters >
struct ViewAllocProp ;
template<>
struct ViewAllocProp<> {
struct NullSpace {};
typedef std::false_type allow_padding_t ;
typedef std::true_type initialize_t ;
typedef NullSpace memory_space ;
typedef NullSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp()
: label()
, memory()
, execution()
, allow_padding()
, initialize()
{}
ViewAllocProp( const std::string & arg_label )
: label( arg_label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< const char * , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const char * const arg_label , Parameters ... arg_param )
: label( arg_label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< std::string , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const std::string & arg_label , Parameters ... arg_param )
: label( arg_label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< WithoutInitializing_t , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef std::false_type initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const WithoutInitializing_t & , Parameters ... arg_param )
: label( base_prop_type( arg_param ... ).label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class ... Parameters >
struct ViewAllocProp< AllowPadding_t , Parameters ... >
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef std::true_type allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename base_prop_type::memory_space memory_space ;
typedef typename base_prop_type::execution_space execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const AllowPadding_t & , Parameters ... arg_param )
: label( base_prop_type( arg_param ... ).label )
, memory( base_prop_type( arg_param ... ).memory )
, execution( base_prop_type( arg_param ... ).execution )
, allow_padding()
, initialize()
{}
};
template< class Space , class ... Parameters >
struct ViewAllocProp< Space , Parameters ... >
{
enum { is_exec = Kokkos::Impl::is_execution_space< Space >::value };
enum { is_mem = Kokkos::Impl::is_memory_space< Space >::value };
static_assert( is_exec || is_mem , "View allocation given unknown parameter" );
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef typename std::conditional< is_mem , Space , typename base_prop_type::memory_space >::type memory_space ;
typedef typename std::conditional< is_exec , Space , typename base_prop_type::execution_space >::type execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
// Templated so that 'base_prop_type( args ... ).execution'
// is not used unless arg_space == memory_space.
template< class ... Args >
ViewAllocProp( const memory_space & arg_space , Args ... args )
: label( base_prop_type( args ... ).label )
, memory( arg_space )
, execution( base_prop_type( args ... ).execution )
, allow_padding()
, initialize()
{}
// Templated so that 'base_prop_type( args ... ).memory'
// is not used unless arg_space == execution_space.
template< class ... Args >
ViewAllocProp( const execution_space & arg_space , Args ... args )
: label( base_prop_type( args ... ).label )
, memory( base_prop_type( args ... ).memory )
, execution( arg_space )
, allow_padding()
, initialize()
{}
};
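// Composition sketch (illustrative): each specialization peels one parameter
// off the front of the pack and forwards the rest, so properties may be given
// in any order after the label.
//
//   ViewAllocProp< std::string , WithoutInitializing_t , Kokkos::HostSpace >
//     prop( std::string("A") , WithoutInitializing_t() , Kokkos::HostSpace() );
//   // prop.label == "A" ; prop.initialize is std::false_type ;
//   // prop.memory is a Kokkos::HostSpace instance.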
template< class ExecSpace , class MemSpace >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , std::string >
{
typedef ViewAllocProp<> base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef MemSpace memory_space ;
typedef ExecSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const std::string & arg_label )
: label( arg_label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
template< class ExecSpace , class MemSpace , unsigned N >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , char[N] >
{
typedef ViewAllocProp<> base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef MemSpace memory_space ;
typedef ExecSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const char * const arg_label )
: label( arg_label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
// Deprecated in favor of view_alloc( Kokkos::WithoutInitializing )
template< class ExecSpace , class MemSpace >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace >
, Kokkos::ViewAllocateWithoutInitializing
>
{
typedef ViewAllocProp<> base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef std::false_type initialize_t ;
typedef MemSpace memory_space ;
typedef ExecSpace execution_space ;
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
ViewAllocProp( const Kokkos::ViewAllocateWithoutInitializing & arg )
: label( arg.label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
template< class ExecSpace , class MemSpace , class ... Parameters >
struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace >
, ViewAllocProp< Parameters ... >
>
{
typedef ViewAllocProp< Parameters ... > base_prop_type ;
typedef typename base_prop_type::allow_padding_t allow_padding_t ;
typedef typename base_prop_type::initialize_t initialize_t ;
typedef MemSpace memory_space ;
typedef
typename std::conditional
< Kokkos::Impl::is_execution_space< typename base_prop_type::execution_space >::value
, typename base_prop_type::execution_space
, ExecSpace
>::type execution_space ;
static_assert( std::is_same< typename base_prop_type::memory_space , ViewAllocProp<>::NullSpace >::value ||
std::is_same< typename base_prop_type::memory_space , memory_space >::value
, "View allocation given incompatible memory space" );
static_assert( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename execution_space::memory_space
, memory_space >::value
, "View allocation given incompatible execution space" );
const std::string label ;
const memory_space memory ;
const execution_space execution ;
const allow_padding_t allow_padding ;
const initialize_t initialize ;
// If the input properties have a memory or execution space then copy construct those spaces
// otherwise default construct those spaces.
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory( arg_prop.memory )
, execution( arg_prop.execution )
, allow_padding()
, initialize()
{}
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
! Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory( arg_prop.memory )
, execution()
, allow_padding()
, initialize()
{}
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
! Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory()
, execution( arg_prop.execution )
, allow_padding()
, initialize()
{}
template< class P >
ViewAllocProp( const P & arg_prop
, typename std::enable_if
< std::is_same< P , base_prop_type >::value &&
! Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
! Kokkos::Impl::is_execution_space< typename P::execution_space >::value
>::type * = 0 )
: label( arg_prop.label )
, memory()
, execution()
, allow_padding()
, initialize()
{}
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif

File diff suppressed because it is too large
@ -0,0 +1,844 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Singleton.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include <utility>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iomanip>
/* Enable clean up of memory leaks */
#define CLEAN_UP_MEMORY_LEAKS 0
namespace Kokkos { namespace Impl {
namespace {
//-----------------------------------------------------------------------------
// AllocationRecord
//-----------------------------------------------------------------------------
//
// Used to track details about an allocation and provide a ref count
// sizeof(AllocationRecord) == 128
struct AllocationRecord
{
enum {
OFFSET = sizeof(AllocatorBase*) // allocator
+ sizeof(void*) // alloc_ptr
+ sizeof(uint64_t) // alloc_size
+ sizeof(AllocatorAttributeBase*) // attribute
+ sizeof(uint32_t) // node_index
+ sizeof(uint32_t) // ref_count
, LABEL_LENGTH = 128 - OFFSET
};
AllocatorBase * const allocator;
void * const alloc_ptr;
const uint64_t alloc_size;
AllocatorAttributeBase * const attribute;
const int32_t node_index;
volatile uint32_t ref_count;
const char label[LABEL_LENGTH];
AllocationRecord( AllocatorBase * const arg_allocator
, void * arg_alloc_ptr
, uint64_t arg_alloc_size
, int32_t arg_node_index
, const std::string & arg_label
)
: allocator(arg_allocator)
, alloc_ptr(arg_alloc_ptr)
, alloc_size(arg_alloc_size)
, attribute(NULL)
, node_index(arg_node_index)
, ref_count(1)
, label() // zero fill
{
const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
strncpy( const_cast<char *>(label), arg_label.c_str(), length );
}
~AllocationRecord()
{
if (attribute) {
delete attribute;
}
}
uint32_t increment_ref_count()
{
uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
return old_value + 1u;
}
uint32_t decrement_ref_count()
{
uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
return old_value - 1u;
}
void print( std::ostream & oss ) const
{
oss << "{ " << allocator->name()
<< " } : \"" << label
<< "\" ref_count(" << ref_count
<< ") memory[ " << alloc_ptr
<< " + " << alloc_size
<< " ]" ;
}
bool set_attribute( AllocatorAttributeBase * attr )
{
bool result = false;
if (attribute == NULL) {
result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute)
, reinterpret_cast<AllocatorAttributeBase *>(NULL)
, attr );
}
return result;
}
// disallow copy and assignment
AllocationRecord( const AllocationRecord & );
AllocationRecord & operator=(const AllocationRecord &);
};
template <int NumBlocks>
struct Bitset
{
enum { blocks = NumBlocks };
enum { size = blocks * 64 };
enum { block_mask = 63u };
enum { block_shift = 6 };
// used to find free bits in a bitset
static int count_trailing_zeros(uint64_t x)
{
#if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
return x ? __builtin_ctzll(x) : 64;
#elif defined( KOKKOS_COMPILER_INTEL )
enum { shift = 32 };
enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
(x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
64 ;
#elif defined( KOKKOS_COMPILER_IBM )
return x ? __cnttz8(x) : 64;
#else
int i = 0;
for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
return i;
#endif
}
Bitset()
: m_bits()
{
for (int i=0; i < blocks; ++i) {
m_bits[i] = 0u;
}
}
bool set( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
}
bool reset( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
}
bool test( int i )
{
const uint64_t block = m_bits[ i >> block_shift ];
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return block & bit;
}
int find_first_unset() const
{
for (int i=0; i < blocks; ++i) {
const uint64_t block = m_bits[i];
int b = count_trailing_zeros( ~block );
if ( b < 64 ) {
return (i << block_shift) + b;
}
}
return size;
}
volatile uint64_t m_bits[blocks];
};
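// Usage sketch (illustrative): concurrently claim and release slots.
//
//   Bitset<15> bits ;                  // 15 * 64 = 960 bits
//   int i = bits.find_first_unset() ;  // candidate slot
//   if ( i < bits.size && bits.set(i) ) { /* slot i claimed by this thread */ }
//   bits.reset(i) ;                    // release slot i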
//-----------------------------------------------------------------------------
// AllocationRecordPool -- singleton class
//
// global_alloc_rec_pool is the ONLY instance of this class
//
//-----------------------------------------------------------------------------
// Record AllocationRecords in a lock-free circular list.
// Each node in the list has a buffer with space for 959 ((15*64)-1) records
// managed by a bitset. Atomics are used to set and reset bits in the bit set.
// The head of the list is atomically updated to the last node found with
// unused space.
//
// Cost to create an allocation record: amortized O(1), worst case O(num nodes)
// Cost to destroy an allocation record: O(1)
//
// Singleton allocations are pushed onto a lock-free stack that is destroyed
// after the circular list of allocation records.
struct AllocationRecordPool
{
enum { BITSET_BLOCKS = 15 };
typedef Bitset<BITSET_BLOCKS> bitset_type;
enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
struct AllocationNode
{
AllocationNode()
: next()
, bitset()
, buffer()
{
// set the first bit to used
bitset.set(0);
}
void * get_buffer( int32_t node_index )
{
return buffer + (node_index-1) * sizeof(AllocationRecord);
}
// return 0 if no space is available in the node
int32_t get_node_index()
{
int32_t node_index = 0;
do {
node_index = bitset.find_first_unset();
// successfully claimed a bit
if ( node_index != bitset.size && bitset.set(node_index) )
{
return node_index;
}
} while ( node_index != bitset.size );
return 0;
}
void clear_node_index( int32_t node_index )
{
bitset.reset(node_index);
}
AllocationNode * next;
bitset_type bitset;
char buffer[BUFFER_SIZE];
};
struct SingletonNode
{
void * buffer;
SingletonNode * next;
Impl::singleton_destroy_function_type destroy;
SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
: buffer(NULL)
, next(NULL)
, destroy(destroy_func)
{
if (size) {
buffer = malloc(size);
create_func(buffer);
}
}
~SingletonNode()
{
if (buffer) {
try {
destroy(buffer);
} catch(...) {}
free(buffer);
}
}
};
AllocationRecordPool()
: head( new AllocationNode() )
, singleton_head(NULL)
{
// setup ring
head->next = head;
}
~AllocationRecordPool()
{
// delete allocation records
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
#if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
// print node bitset
for (int i=0; i < bitset_type::blocks; ++i ) {
std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " ";
}
std::cout << std::endl;
#endif
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
#if CLEAN_UP_MEMORY_LEAKS
/* Cleaning up memory leaks prevents memory error detection tools
* from reporting the original source of allocation, which can
* impede debugging with such tools.
*/
try {
destroy(alloc_rec);
}
catch(...) {}
#endif
}
}
curr->next = NULL;
delete curr;
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
std::cerr << oss.str() << std::endl;
}
}
// delete singletons
{
SingletonNode * curr = singleton_head;
while (curr) {
SingletonNode * next = curr->next;
delete curr;
curr = next;
}
}
}
AllocationRecord * create( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
AllocationNode * start = volatile_load(&head);
AllocationNode * curr = start;
int32_t node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
while (node_index == 0 && curr != start)
{
node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
}
// Need to allocate and insert a new node
if (node_index == 0 && curr == start)
{
AllocationNode * new_node = new AllocationNode();
node_index = new_node->get_node_index();
AllocationNode * next = NULL;
do {
next = volatile_load(&curr->next);
new_node->next = next;
memory_fence();
} while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
curr = new_node;
}
void * buffer = curr->get_buffer(node_index);
// try to set head to curr
if ( start != curr )
{
atomic_compare_exchange( & head, start, curr );
}
return new (buffer) AllocationRecord( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, node_index
, arg_label
);
}
void destroy( AllocationRecord * alloc_rec )
{
if (alloc_rec) {
const int32_t node_index = alloc_rec->node_index;
AllocationNode * node = get_node( alloc_rec );
// deallocate memory
alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
// call destructor
alloc_rec->~AllocationRecord();
// wait for writes to complete
memory_fence();
// clear node index
node->clear_node_index( node_index );
}
}
void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
{
SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
SingletonNode * next;
// insert new node at the head of the list
do {
next = volatile_load(&singleton_head);
node->next = next;
} while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
return node->buffer;
}
void print_memory( std::ostream & out ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
}
}
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Tracked Memory:" << std::endl;
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
out << oss.str() << std::endl;
}
else {
out << "No Tracked Memory" << std::endl;
}
}
// find an AllocationRecord such that
// alloc_ptr <= ptr < alloc_ptr + alloc_size
// otherwise return NULL
AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
char const * const char_ptr = reinterpret_cast<const char *>(ptr);
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
if ( (allocator == alloc_rec->allocator)
&& (alloc_ptr <= char_ptr)
&& (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
{
return alloc_rec;
}
}
}
curr = next;
} while ( curr != start );
return NULL;
}
private:
AllocationNode * get_node( AllocationRecord * alloc_rec )
{
return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
}
AllocationNode * head;
SingletonNode * singleton_head;
};
// create the global pool for allocation records
AllocationRecordPool global_alloc_rec_pool;
// convert a uintptr_t to an AllocationRecord pointer
inline
AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
{
return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
}
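// Illustrative sketch (not part of the library): the least significant bit of
// an AllocationTracker's m_alloc_rec doubles as the "reference counting" flag,
// and to_alloc_rec() masks it off to recover the record pointer:
//
//   uintptr_t raw = reinterpret_cast<uintptr_t>(rec) | uintptr_t(1); // counted
//   AllocationRecord * back = to_alloc_rec( raw );  // low bit cleared
//   bool counting = raw & uintptr_t(1);             // true
//
// This works because AllocationRecord has alignment greater than 1, so the
// low bit of a valid record pointer is always zero.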
} // unnamed namespace
//-----------------------------------------------------------------------------
// Allocation Tracker methods
//-----------------------------------------------------------------------------
// Create a reference counted AllocationTracker
void AllocationTracker::initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
// create record
AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, arg_label
);
m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
}
void AllocationTracker::reallocate( size_t size ) const
{
AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
if ( NULL != the_alloc_ptr )
{
*const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
*const_cast<uint64_t *>(&rec->alloc_size) = size;
}
else {
Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
}
}
void AllocationTracker::increment_ref_count() const
{
to_alloc_rec( m_alloc_rec )->increment_ref_count();
}
void AllocationTracker::decrement_ref_count() const
{
AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
uint32_t the_ref_count = alloc_rec->decrement_ref_count();
if (the_ref_count == 0u) {
try {
global_alloc_rec_pool.destroy( alloc_rec );
}
catch(...) {}
}
}
namespace {
struct NullAllocator { static const char * name() { return "Null Allocator"; } };
}
AllocatorBase * AllocationTracker::allocator() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->allocator;
}
return Allocator<NullAllocator>::singleton();
}
void * AllocationTracker::alloc_ptr() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_ptr;
}
return NULL;
}
size_t AllocationTracker::alloc_size() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_size;
}
return 0u;
}
size_t AllocationTracker::ref_count() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->ref_count;
}
return 0u;
}
char const * AllocationTracker::label() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->label;
}
return "[Empty Allocation Tracker]";
}
void AllocationTracker::print( std::ostream & oss) const
{
if (m_alloc_rec & REF_COUNT_MASK) {
to_alloc_rec(m_alloc_rec)->print(oss);
}
else {
oss << label();
}
}
bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
{
bool result = false;
if (m_alloc_rec & REF_COUNT_MASK) {
result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
}
return result;
}
AllocatorAttributeBase * AllocationTracker::attribute() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->attribute;
}
return NULL;
}
void AllocationTracker::print_tracked_memory( std::ostream & out )
{
global_alloc_rec_pool.print_memory( out );
}
AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
{
AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
AllocationTracker tracker;
if ( alloc_rec != NULL )
{
if ( tracking_enabled() ) {
alloc_rec->increment_ref_count();
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
else {
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
}
}
return tracker ;
}
//-----------------------------------------------------------------------------
// static AllocationTracker
//-----------------------------------------------------------------------------
#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
namespace {
// TODO : Detect compiler support for thread local variables
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_PTHREAD )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_SERIAL )
bool g_thread_local_tracking_enabled = true;
#endif
} // unnamed namespace
void AllocationTracker::disable_tracking()
{
g_thread_local_tracking_enabled = false;
}
void AllocationTracker::enable_tracking()
{
g_thread_local_tracking_enabled = true;
}
bool AllocationTracker::tracking_enabled()
{
return g_thread_local_tracking_enabled;
}
#else
namespace {
enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
}
void AllocationTracker::disable_tracking()
{
if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already disabled");
}
}
void AllocationTracker::enable_tracking()
{
if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already enabled");
}
}
bool AllocationTracker::tracking_enabled()
{
return g_tracking_enabled == TRACKING_ENABLED;
}
#endif
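// Note (a sketch of the two modes, not upstream commentary): with
// KOKKOS_USE_DECENTRALIZED_HOST each host thread owns its own tracking flag
// (OpenMP threadprivate or __thread), so disable/enable only affect the
// calling thread. Otherwise a single volatile flag is toggled through
// atomic_compare_exchange, which also detects mismatched disable/disable or
// enable/enable calls and throws.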
//-----------------------------------------------------------------------------
// create singleton free function
//-----------------------------------------------------------------------------
void * create_singleton( size_t size
, Impl::singleton_create_function_type create_func
, Impl::singleton_destroy_function_type destroy_func )
{
return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
}
}} // namespace Kokkos::Impl
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */

View File

@@ -0,0 +1,586 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
#define KOKKOS_ALLOCATION_TRACKER_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h>
#include <cstdlib>
#include <string>
#include <iosfwd>
namespace Kokkos { namespace Impl {
//-----------------------------------------------------------------------------
// Create Singleton objects
//-----------------------------------------------------------------------------
typedef void * (*singleton_create_function_type)(void * buffer);
typedef void (*singleton_destroy_function_type)(void *);
void * create_singleton( size_t size
, singleton_create_function_type create_func
, singleton_destroy_function_type destroy_func
);
/// class Singleton
///
/// Default constructs a singleton of the given type. This class is used to
/// circumvent order-of-construction issues. Singleton objects are destroyed
/// after all other allocations, in the reverse order of their creation.
template <typename Type>
class Singleton
{
public:
/// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
static Type * get()
{
static Type * singleton = NULL;
if (singleton == NULL) {
Impl::singleton_create_function_type create_func = &create;
Impl::singleton_destroy_function_type destroy_func = &destroy;
singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
}
return singleton;
}
private:
/// Call the Type destructor
static void destroy(void * ptr)
{
reinterpret_cast<Type*>(ptr)->~Type();
}
/// placement new the Type in buffer
static void * create(void * buffer)
{
return new (buffer) Type();
}
};
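// Usage sketch (MyCache is a hypothetical default-constructible type, not a
// Kokkos class):
//
//   MyCache * cache = Kokkos::Impl::Singleton<MyCache>::get();
//
// The first call placement-news a MyCache into pool-managed storage and
// registers the destroy callback; subsequent calls return the same pointer.
// The pool runs the registered destroy callbacks after all tracked
// allocations are released, in reverse order of creation.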
//-----------------------------------------------------------------------------
// AllocatorBase
//-----------------------------------------------------------------------------
/// class AllocatorBase
///
/// Abstract base class for all Allocators.
/// Allocators should be singleton objects; use Singleton<Allocator>::get()
/// to create them and avoid order-of-destruction issues.
class AllocatorBase
{
public:
/// name of the allocator
/// used to report memory leaks
virtual const char * name() const = 0;
/// Allocate a buffer of size number of bytes
virtual void* allocate(size_t size) const = 0;
/// Deallocate a buffer of size number of bytes
/// The pointer must have been allocated by a call to the corresponding allocate
virtual void deallocate(void * ptr, size_t size) const = 0;
/// Changes the size of the memory block pointed to by ptr.
/// Ptr must have been allocated with the corresponding allocate call
/// The function may move the memory block to a new location
/// (whose address is returned by the function).
///
/// The content of the memory block is preserved up to the lesser of the new and
/// old sizes, even if the block is moved to a new location. If the new size is larger,
/// the value of the newly allocated portion is indeterminate.
///
/// If ptr is a null pointer, the function behaves like allocate, allocating
/// a new block of new_size bytes and returning a pointer to its beginning.
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
/// can a texture object be bound to the allocated memory
virtual bool support_texture_binding() const = 0;
/// virtual destructor
virtual ~AllocatorBase() {}
};
/// class AllocatorAttributeBase
class AllocatorAttributeBase
{
public:
virtual ~AllocatorAttributeBase() {}
};
//-----------------------------------------------------------------------------
// Allocator< StaticAllocator > : public AllocatorBase
//-----------------------------------------------------------------------------
// HasStaticName
template<typename T>
class HasStaticName
{
typedef const char * (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::name>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticName<T>::value, const char *>::type
allocator_name()
{
return T::name();
}
template <typename T>
inline
typename enable_if<!HasStaticName<T>::value, const char *>::type
allocator_name()
{
return "Unnamed Allocator";
}
// HasStaticAllocate
template<typename T>
class HasStaticAllocate
{
typedef void * (*static_method)(size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::allocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t size)
{
return T::allocate(size);
}
template <typename T>
inline
typename enable_if<!HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot allocate memory!") );
return NULL;
}
// HasStaticDeallocate
template<typename T>
class HasStaticDeallocate
{
typedef void (*static_method)(void *, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void * ptr, size_t size)
{
T::deallocate(ptr,size);
}
template <typename T>
inline
typename enable_if<!HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void *, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot deallocate memory!") );
}
// HasStaticReallocate
template<typename T>
class HasStaticReallocate
{
typedef void * (*static_method)(void *, size_t, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
return T::reallocate(old_ptr, old_size, new_size);
}
template <typename T>
inline
typename enable_if<!HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void *, size_t, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot reallocate memory!") );
return NULL;
}
// HasStaticSupportTextureBinding
template<typename T>
class HasStaticSupportTextureBinding
{
typedef bool (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return T::support_texture_binding();
}
template <typename T>
inline
typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return false;
}
template <typename T>
class Allocator : public AllocatorBase
{
public:
virtual const char * name() const
{
return allocator_name<T>();
}
virtual void* allocate(size_t size) const
{
return allocator_allocate<T>(size);
}
virtual void deallocate(void * ptr, size_t size) const
{
allocator_deallocate<T>(ptr,size);
}
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
{
return allocator_reallocate<T>(old_ptr, old_size, new_size);
}
virtual bool support_texture_binding() const
{
return allocator_support_texture_binding<T>();
}
static AllocatorBase * singleton()
{
return Singleton< Allocator<T> >::get();
}
};
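// Illustrative sketch of a StaticAllocator that Allocator<T> can wrap.
// MallocAllocator is a hypothetical example, not part of Kokkos:
//
//   struct MallocAllocator {
//     static const char * name() { return "Malloc Allocator"; }
//     static void * allocate( size_t size ) { return std::malloc( size ); }
//     static void deallocate( void * ptr, size_t ) { std::free( ptr ); }
//     static void * reallocate( void * ptr, size_t, size_t new_size )
//     { return std::realloc( ptr, new_size ); }
//   };
//
//   AllocatorBase * base = Allocator<MallocAllocator>::singleton();
//
// Any static method omitted is detected by the HasStatic* traits above and
// routed to a throwing stub (or to 'false' for support_texture_binding).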
//-----------------------------------------------------------------------------
// AllocationTracker
//-----------------------------------------------------------------------------
// forward declaration for friend classes
struct CopyWithoutTracking;
struct MallocHelper;
/// class AllocationTracker
/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
/// Reference counting is disabled when the host is in parallel.
class AllocationTracker
{
// use the least significant bit of the AllocationRecord pointer to indicate if the
// AllocationTracker should reference count
enum {
REF_COUNT_BIT = static_cast<uintptr_t>(1)
, REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
};
public:
/// Find an AllocationTracker such that
/// alloc_ptr <= ptr < alloc_ptr + alloc_size
/// O(n) where n is the number of tracked allocations.
template <typename StaticAllocator>
static AllocationTracker find( void const * ptr )
{
return find( ptr, Allocator<StaticAllocator>::singleton() );
}
/// Pretty print all the currently tracked memory
static void print_tracked_memory( std::ostream & out );
/// Default constructor
KOKKOS_INLINE_FUNCTION
AllocationTracker()
: m_alloc_rec(0)
{}
/// Create an AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label = std::string("") )
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Create an AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, size_t arg_alloc_size
, const std::string & arg_label = std::string("")
)
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Copy an AllocationTracker
KOKKOS_INLINE_FUNCTION
AllocationTracker( const AllocationTracker & rhs )
: m_alloc_rec( rhs.m_alloc_rec)
{
#if !defined( __CUDA_ARCH__ )
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
#endif
}
/// Copy-assign an AllocationTracker
/// Decrement the reference count of the current tracker if necessary
KOKKOS_INLINE_FUNCTION
AllocationTracker & operator=( const AllocationTracker & rhs )
{
if (this != &rhs) {
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
m_alloc_rec = rhs.m_alloc_rec;
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
#endif
}
return * this;
}
/// Destructor
/// Decrement the reference count if necessary
KOKKOS_INLINE_FUNCTION
~AllocationTracker()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
}
/// Is the tracker valid?
KOKKOS_INLINE_FUNCTION
bool is_valid() const
{
return (m_alloc_rec & REF_COUNT_MASK);
}
/// clear the tracker
KOKKOS_INLINE_FUNCTION
void clear()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
m_alloc_rec = 0;
}
/// is this tracker currently counting allocations?
KOKKOS_INLINE_FUNCTION
bool ref_counting() const
{
return (m_alloc_rec & REF_COUNT_BIT);
}
AllocatorBase * allocator() const;
/// pointer to the allocated memory
void * alloc_ptr() const;
/// size in bytes of the allocated memory
size_t alloc_size() const;
/// the current reference count
size_t ref_count() const;
/// the label given to the allocation
char const * label() const;
/// pretty print all the tracker's information to the std::ostream
void print( std::ostream & oss) const;
/// set an attribute ptr on the allocation record
/// the arg_attribute pointer will be deleted when the record is destroyed
/// the attribute ptr can only be set once
bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
/// get the attribute ptr from the allocation record
AllocatorAttributeBase * attribute() const;
/// reallocate the memory tracked by this allocation
/// NOT thread-safe
void reallocate( size_t size ) const;
private:
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
void initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, std::string const & label );
void increment_ref_count() const;
void decrement_ref_count() const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
friend struct Impl::CopyWithoutTracking;
friend struct Impl::MallocHelper;
uintptr_t m_alloc_rec;
};
/// Make a copy of the functor with reference counting disabled
struct CopyWithoutTracking
{
template <typename Functor>
static Functor apply( const Functor & f )
{
AllocationTracker::disable_tracking();
Functor func(f);
AllocationTracker::enable_tracking();
return func;
}
};
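// Usage sketch (MyFunctor is a hypothetical copy-constructible type):
//
//   MyFunctor f = ...;
//   MyFunctor g = Kokkos::Impl::CopyWithoutTracking::apply( f );
//
// The copy constructor runs between disable_tracking() and enable_tracking(),
// so AllocationTracker members inside the functor are copied without touching
// reference counts, as they would be when copied inside a parallel region.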
}} // namespace Kokkos::Impl
#endif //KOKKOS_ALLOCATION_TRACKER_HPP

View File

@@ -0,0 +1,260 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ANALYZESHAPE_HPP
#define KOKKOS_ANALYZESHAPE_HPP
#include <impl/Kokkos_Shape.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief Analyze the array shape defined by a Kokkos::View data type.
*
* It is presumed that the data type can be mapped down to a multidimensional
* array of an intrinsic scalar numerical type (double, float, int, ... ).
* The 'value_type' of an array may be an embedded aggregate type such
* as a fixed length array 'Array<T,N>'.
* In this case the 'array_intrinsic_type' represents the
* underlying array of intrinsic scalar numerical type.
*
* The embedded aggregate type must have an AnalyzeShape specialization
* to map it down to a shape and intrinsic scalar numerical type.
*/
template< class T >
struct AnalyzeShape : public Shape< sizeof(T) , 0 >
{
typedef void specialize ;
typedef Shape< sizeof(T), 0 > shape ;
typedef T array_intrinsic_type ;
typedef T value_type ;
typedef T type ;
typedef const T const_array_intrinsic_type ;
typedef const T const_value_type ;
typedef const T const_type ;
typedef T non_const_array_intrinsic_type ;
typedef T non_const_value_type ;
typedef T non_const_type ;
};
template<>
struct AnalyzeShape<void> : public Shape< 0 , 0 >
{
typedef void specialize ;
typedef Shape< 0 , 0 > shape ;
typedef void array_intrinsic_type ;
typedef void value_type ;
typedef void type ;
typedef const void const_array_intrinsic_type ;
typedef const void const_value_type ;
typedef const void const_type ;
typedef void non_const_array_intrinsic_type ;
typedef void non_const_value_type ;
typedef void non_const_type ;
};
template< class T >
struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename nested::shape shape ;
typedef typename nested::const_array_intrinsic_type array_intrinsic_type ;
typedef typename nested::const_value_type value_type ;
typedef typename nested::const_type type ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type ;
};
template< class T >
struct AnalyzeShape< T * >
: public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
typedef typename nested::array_intrinsic_type * array_intrinsic_type ;
typedef typename nested::value_type value_type ;
typedef typename nested::type * type ;
typedef typename nested::const_array_intrinsic_type * const_array_intrinsic_type ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type * const_type ;
typedef typename nested::non_const_array_intrinsic_type * non_const_array_intrinsic_type ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type * non_const_type ;
};
template< class T >
struct AnalyzeShape< T[] >
: public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [] ;
};
template< class T >
struct AnalyzeShape< const T[] >
: public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type
{
private:
typedef AnalyzeShape< const T > nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [] ;
};
template< class T , unsigned N >
struct AnalyzeShape< T[N] >
: public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type
{
private:
typedef AnalyzeShape<T> nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [N] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [N] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [N] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [N] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [N] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [N] ;
};
template< class T , unsigned N >
struct AnalyzeShape< const T[N] >
: public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type
{
private:
typedef AnalyzeShape< const T > nested ;
public:
typedef typename nested::specialize specialize ;
typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
typedef typename nested::array_intrinsic_type array_intrinsic_type [N] ;
typedef typename nested::value_type value_type ;
typedef typename nested::type type [N] ;
typedef typename nested::const_array_intrinsic_type const_array_intrinsic_type [N] ;
typedef typename nested::const_value_type const_value_type ;
typedef typename nested::const_type const_type [N] ;
typedef typename nested::non_const_array_intrinsic_type non_const_array_intrinsic_type [N] ;
typedef typename nested::non_const_value_type non_const_value_type ;
typedef typename nested::non_const_type non_const_type [N] ;
};
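// Worked example (a sketch of how the recursion composes): for the View data
// type 'double *[3]' the specializations peel the array-of-3 and the pointer,
// giving a rank-2 shape over the intrinsic scalar type double:
//
//   typedef AnalyzeShape< double *[3] > analysis ;
//   // analysis::value_type        is double
//   // analysis::const_value_type  is const double
//   // analysis::non_const_type    is double *[3]
//   // analysis::const_type        is const double *[3]
//   // analysis::shape             has rank 2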
} // namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */

View File

@@ -0,0 +1,214 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_X86_HPP )
#define KOKKOS_ATOMIC_ASSEMBLY_X86_HPP
namespace Kokkos {
#ifdef KOKKOS_ENABLE_ASM
#ifndef __CUDA_ARCH__
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<char>(volatile char* a) {
__asm__ __volatile__(
"lock incb %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<short>(volatile short* a) {
__asm__ __volatile__(
"lock incw %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<int>(volatile int* a) {
__asm__ __volatile__(
"lock incl %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_increment<long long int>(volatile long long int* a) {
__asm__ __volatile__(
"lock incq %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<char>(volatile char* a) {
__asm__ __volatile__(
"lock decb %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<short>(volatile short* a) {
__asm__ __volatile__(
"lock decw %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<int>(volatile int* a) {
__asm__ __volatile__(
"lock decl %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
template<>
KOKKOS_INLINE_FUNCTION
void atomic_decrement<long long int>(volatile long long int* a) {
__asm__ __volatile__(
"lock decq %0"
: /* no output registers */
: "m" (a[0])
: "memory"
);
}
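// Note (descriptive sketch): the "lock" prefix makes each increment/decrement
// a bus-locked read-modify-write; the b/w/l/q suffixes select the operand
// width (8/16/32/64 bit). The "memory" clobber keeps the compiler from
// reordering other memory accesses across the asm statement.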
#endif
#endif
namespace Impl {
struct cas128_t
{
uint64_t lower;
uint64_t upper;
KOKKOS_INLINE_FUNCTION
cas128_t () {
lower = 0;
upper = 0;
}
KOKKOS_INLINE_FUNCTION
cas128_t (const cas128_t& a) {
lower = a.lower;
upper = a.upper;
}
KOKKOS_INLINE_FUNCTION
cas128_t (volatile cas128_t* a) {
lower = a->lower;
upper = a->upper;
}
KOKKOS_INLINE_FUNCTION
bool operator != (const cas128_t& a) const {
return (lower != a.lower) || (upper != a.upper);
}
KOKKOS_INLINE_FUNCTION
void operator = (const cas128_t& a) {
lower = a.lower;
upper = a.upper;
}
KOKKOS_INLINE_FUNCTION
void operator = (const cas128_t& a) volatile {
lower = a.lower;
upper = a.upper;
}
}
__attribute__ (( __aligned__( 16 ) ));
inline cas128_t cas128( volatile cas128_t * ptr, cas128_t cmp, cas128_t swap )
{
#ifdef KOKKOS_ENABLE_ASM
bool swapped = false;
__asm__ __volatile__
(
"lock cmpxchg16b %1\n\t"
"setz %0"
: "=q" ( swapped )
, "+m" ( *ptr )
, "+d" ( cmp.upper )
, "+a" ( cmp.lower )
: "c" ( swap.upper )
, "b" ( swap.lower )
, "q" ( swapped )
);
return cmp;
#else
cas128_t tmp(ptr);
if(tmp != cmp) {
return tmp;
} else {
*ptr = swap;
return swap;
}
#endif
}
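// Note (descriptive sketch): cmpxchg16b compares RDX:RAX against the 16-byte
// memory operand and, on a match, stores RCX:RBX into it, which is what the
// register constraints above encode; the operand must be 16-byte aligned,
// hence the __aligned__(16) attribute on cas128_t. The non-ASM fallback is a
// plain read-compare-write and is NOT atomic; it only serves builds without
// KOKKOS_ENABLE_ASM.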
}
}
#endif

View File

@@ -0,0 +1,259 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type).
// Must cast-away 'volatile' for the CAS call.
#if defined( KOKKOS_ATOMICS_USE_CUDA )
__inline__ __device__
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{ return atomicCAS((int*)dest,compare,val); }
__inline__ __device__
unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val)
{ return atomicCAS((unsigned int*)dest,compare,val); }
__inline__ __device__
unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest ,
const unsigned long long int compare ,
const unsigned long long int val )
{ return atomicCAS((unsigned long long int*)dest,compare,val); }
template < typename T >
__inline__ __device__
T atomic_compare_exchange( volatile T * const dest , const T & compare ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
const int tmp = atomicCAS( (int*) dest , *((int*)&compare) , *((int*)&val) );
return *((T*)&tmp);
}
template < typename T >
__inline__ __device__
T atomic_compare_exchange( volatile T * const dest , const T & compare ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
const type tmp = atomicCAS( (type*) dest , *((type*)&compare) , *((type*)&val) );
return *((T*)&tmp);
}
template < typename T >
__inline__ __device__
T atomic_compare_exchange( volatile T * const dest , const T & compare ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid deadlock within a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
if( return_val == compare )
*dest = val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
//----------------------------------------------------------------------------
// GCC native CAS supports int, long, unsigned int, unsigned long.
// Intel native CAS supports int and long with the same interface as GCC.
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
{ return __sync_val_compare_and_swap(dest,compare,val); }
KOKKOS_INLINE_FUNCTION
long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
// GCC supports unsigned
KOKKOS_INLINE_FUNCTION
unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
KOKKOS_INLINE_FUNCTION
unsigned long atomic_compare_exchange( volatile unsigned long * const dest ,
const unsigned long compare ,
const unsigned long val )
{ return __sync_val_compare_and_swap(dest,compare,val); }
#endif
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#else
union U {
int i ;
T t ;
} tmp ;
#endif
tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
return tmp.t ;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T & >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
long i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
#else
union U {
long i ;
T t ;
} tmp ;
#endif
tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
return tmp.t ;
}
#ifdef KOKKOS_ENABLE_ASM
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) != sizeof(long) &&
sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} tmp ;
tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
return tmp.t ;
}
#endif
template < typename T >
inline
T atomic_compare_exchange( volatile T * const dest , const T compare ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
if( return_val == compare ) {
const T tmp = *dest = val;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
}
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange( volatile T * const dest, const T compare, const T val )
{
T retval;
#pragma omp critical
{
retval = dest[0];
if ( retval == compare )
dest[0] = val;
}
return retval;
}
#endif
template <typename T>
KOKKOS_INLINE_FUNCTION
bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
{
return compare == atomic_compare_exchange(dest, compare, val);
}
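// Usage sketch (hypothetical helper, not part of this header): the strong CAS
// above is the building block for read-modify-write loops, e.g. a fetch-max:
//
//   KOKKOS_INLINE_FUNCTION
//   int atomic_fetch_max_sketch( volatile int * const dest , const int val )
//   {
//     int old = *dest ;
//     while ( old < val ) {
//       const int seen = Kokkos::atomic_compare_exchange( dest , old , val );
//       if ( seen == old ) break ; // installed val
//       old = seen ;               // lost the race; retry against new value
//     }
//     return old ;
//   }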
//----------------------------------------------------------------------------
} // namespace Kokkos
#endif

View File

@@ -0,0 +1,340 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
#define KOKKOS_ATOMIC_EXCHANGE_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
__inline__ __device__
int atomic_exchange( volatile int * const dest , const int val )
{
// return __iAtomicExch( (int*) dest , val );
return atomicExch( (int*) dest , val );
}
__inline__ __device__
unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val )
{
// return __uAtomicExch( (unsigned int*) dest , val );
return atomicExch( (unsigned int*) dest , val );
}
__inline__ __device__
unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val )
{
// return __ullAtomicExch( (unsigned long long*) dest , val );
return atomicExch( (unsigned long long*) dest , val );
}
/** \brief Atomic exchange for any type with compatible size */
template< typename T >
__inline__ __device__
T atomic_exchange(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
// int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
return *((T*)&tmp);
}
template< typename T >
__inline__ __device__
T atomic_exchange(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
// type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
return *((T*)&tmp);
}
template < typename T >
__inline__ __device__
T atomic_exchange( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid deadlock within a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
/** \brief Atomic exchange for any type with compatible size */
template< typename T >
__inline__ __device__
void atomic_assign(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
{
// (void) __ullAtomicExch( (int*) dest , *((int*)&val) );
(void) atomicExch( ((int*)dest) , *((int*)&val) );
}
template< typename T >
__inline__ __device__
void atomic_assign(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
{
typedef unsigned long long int type ;
// (void) __ullAtomicExch( (type*) dest , *((type*)&val) );
(void) atomicExch( ((type*)dest) , *((type*)&val) );
}
template< typename T >
__inline__ __device__
void atomic_assign(
volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) != sizeof(unsigned long long int)
, const T & >::type val )
{
(void) atomic_exchange(dest,val);
}
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
template< typename T >
KOKKOS_INLINE_FUNCTION
T atomic_exchange( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
, const T & >::type val )
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
const type v = *((type*)&val); // Extract to be sure the value doesn't change
type assumed ;
#ifdef KOKKOS_HAVE_CXX11
union U {
T val_T ;
type val_type ;
KOKKOS_INLINE_FUNCTION U() {};
} old ;
#else
union { T val_T ; type val_type ; } old ;
#endif
old.val_T = *dest ;
do {
assumed = old.val_type ;
old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
} while ( assumed != old.val_type );
return old.val_T ;
}
#if defined(KOKKOS_ENABLE_ASM)
template< typename T >
KOKKOS_INLINE_FUNCTION
T atomic_exchange( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
newval.t = val;
do {
assume.i = oldval.i ;
oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
#endif
//----------------------------------------------------------------------------
template < typename T >
inline
T atomic_exchange( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = val;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
template< typename T >
KOKKOS_INLINE_FUNCTION
void atomic_assign( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
, const T & >::type val )
{
typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
const type v = *((type*)&val); // Extract to be sure the value doesn't change
type assumed ;
#ifdef KOKKOS_HAVE_CXX11
union U {
T val_T ;
type val_type ;
KOKKOS_INLINE_FUNCTION U() {};
} old ;
#else
union { T val_T ; type val_type ; } old ;
#endif
old.val_T = *dest ;
do {
assumed = old.val_type ;
old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
} while ( assumed != old.val_type );
}
#ifdef KOKKOS_ENABLE_ASM
template< typename T >
KOKKOS_INLINE_FUNCTION
void atomic_assign( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
, const T & >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
newval.t = val;
do {
assume.i = oldval.i ;
oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i);
} while ( assume.i != oldval.i );
}
#endif
template < typename T >
inline
void atomic_assign( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
*dest = val;
Impl::unlock_address_host_space( (void*) dest );
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_exchange( volatile T * const dest , const T val )
{
T retval;
//#pragma omp atomic capture
#pragma omp critical
{
retval = dest[0];
dest[0] = val;
}
return retval;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
void atomic_assign( volatile T * const dest , const T val )
{
//#pragma omp atomic
#pragma omp critical
{
dest[0] = val;
}
}
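// Note (descriptive sketch): an unnamed "#pragma omp critical" serializes all
// threads through one process-wide lock, which is correct for arbitrary T but
// much slower than the commented-out "#pragma omp atomic" forms, which are
// limited to scalar types and simple expressions.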
#endif
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------

View File

@@ -0,0 +1,326 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int, unsigned int, unsigned long long int, and float
__inline__ __device__
int atomic_fetch_add( volatile int * const dest , const int val )
{ return atomicAdd((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
{ return atomicAdd((unsigned int*)dest,val); }
__inline__ __device__
unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest ,
const unsigned long long int val )
{ return atomicAdd((unsigned long long int*)dest,val); }
__inline__ __device__
float atomic_fetch_add( volatile float * const dest , const float val )
{ return atomicAdd((float*)dest,val); }
template < typename T >
__inline__ __device__
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
int i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
__inline__ __device__
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
unsigned long long int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
unsigned long long int i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//----------------------------------------------------------------------------
template < typename T >
__inline__ __device__
T atomic_fetch_add( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = return_val + val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * const dest , const int val )
{ return __sync_fetch_and_add(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_add( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_add(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_add(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_add(dest,val); }
#endif
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
int i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
int i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T >::type val )
{
#ifdef KOKKOS_HAVE_CXX11
union U {
long i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
#else
union U {
long i ;
T t ;
} assume , oldval , newval ;
#endif
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
#ifdef KOKKOS_ENABLE_ASM
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_add( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) != sizeof(long) &&
sizeof(T) == sizeof(Impl::cas128_t) , const T >::type val )
{
union U {
Impl::cas128_t i ;
T t ;
KOKKOS_INLINE_FUNCTION U() {};
} assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t + val ;
oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
#endif
//----------------------------------------------------------------------------
template < typename T >
inline
T atomic_fetch_add( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
const T tmp = *dest = return_val + val;
#ifndef KOKKOS_COMPILER_CLANG
(void) tmp;
#endif
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_add( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] += val;
}
return retval;
}
#endif
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_add without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_add(volatile T * const dest, const T src) {
atomic_fetch_add(dest,src);
}
// Atomic increment
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_increment(volatile T* a) {
Kokkos::atomic_fetch_add(a,1);
}
template<typename T>
KOKKOS_INLINE_FUNCTION
void atomic_decrement(volatile T* a) {
Kokkos::atomic_fetch_add(a,-1);
}
}
#endif
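A minimal sketch of how the entry points in this header compose (values illustrative; atomic_fetch_add returns the pre-update value, while the atomic_add, atomic_increment, and atomic_decrement wrappers discard it):
// Hypothetical usage sketch.
int counter = 10;
int before = Kokkos::atomic_fetch_add( &counter , 5 ); // before == 10, counter == 15
Kokkos::atomic_add( &counter , 5 );                    // counter == 20
Kokkos::atomic_increment( &counter );                  // counter == 21
Kokkos::atomic_decrement( &counter );                  // counter == 20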


@ -0,0 +1,125 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
#define KOKKOS_ATOMIC_FETCH_AND_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int, unsigned int, and unsigned long long int
// (the latter requires compute capability 3.5 or higher)
__inline__ __device__
int atomic_fetch_and( volatile int * const dest , const int val )
{ return atomicAnd((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
{ return atomicAnd((unsigned int*)dest,val); }
#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
__inline__ __device__
unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest ,
const unsigned long long int val )
{ return atomicAnd((unsigned long long int*)dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_and( volatile int * const dest , const int val )
{ return __sync_fetch_and_and(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_and( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_and(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_and(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_and(dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_and( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] &= val;
}
return retval;
}
#endif
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_and without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_and(volatile T * const dest, const T src) {
(void)atomic_fetch_and(dest,src);
}
}
#endif
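A brief sketch of the usual bit-mask idiom for this header (values illustrative):
// Hypothetical usage sketch: atomically clear the high bits of a shared flag word.
unsigned int flags = 0xFFu;
unsigned int before = Kokkos::atomic_fetch_and( &flags , 0x0Fu ); // before == 0xFF, flags == 0x0F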


@ -0,0 +1,125 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
#define KOKKOS_ATOMIC_FETCH_OR_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int, unsigned int, and unsigned long long int
// (the latter requires compute capability 3.5 or higher)
__inline__ __device__
int atomic_fetch_or( volatile int * const dest , const int val )
{ return atomicOr((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
{ return atomicOr((unsigned int*)dest,val); }
#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
__inline__ __device__
unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest ,
const unsigned long long int val )
{ return atomicOr((unsigned long long int*)dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_or( volatile int * const dest , const int val )
{ return __sync_fetch_and_or(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_or( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_or(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_or(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_or(dest,val); }
#endif
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_or( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] |= val;
}
return retval;
}
#endif
//----------------------------------------------------------------------------
// Simpler version of atomic_fetch_or without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_or(volatile T * const dest, const T src) {
(void)atomic_fetch_or(dest,src);
}
}
#endif
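The companion idiom for this header atomically sets bits rather than clearing them (values illustrative):
// Hypothetical usage sketch: atomically set a bit in a shared flag word.
unsigned int flags = 0x01u;
unsigned int before = Kokkos::atomic_fetch_or( &flags , 0x10u ); // before == 0x01, flags == 0x11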


@ -0,0 +1,233 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
namespace Kokkos {
//----------------------------------------------------------------------------
#if defined( KOKKOS_ATOMICS_USE_CUDA )
// Support for int and unsigned int (atomicSub covers only these two types)
__inline__ __device__
int atomic_fetch_sub( volatile int * const dest , const int val )
{ return atomicSub((int*)dest,val); }
__inline__ __device__
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
{ return atomicSub((unsigned int*)dest,val); }
template < typename T >
__inline__ __device__
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
union { int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
__inline__ __device__
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
union { unsigned long long int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//----------------------------------------------------------------------------
template < typename T >
__inline__ __device__
T atomic_fetch_sub( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
T return_val;
// This is a way to (hopefully) avoid dead lock in a warp
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = return_val - val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
}
//----------------------------------------------------------------------------
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
KOKKOS_INLINE_FUNCTION
int atomic_fetch_sub( volatile int * const dest , const int val )
{ return __sync_fetch_and_sub(dest,val); }
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_sub( volatile long int * const dest , const long int val )
{ return __sync_fetch_and_sub(dest,val); }
#if defined( KOKKOS_ATOMICS_USE_GCC )
KOKKOS_INLINE_FUNCTION
unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
{ return __sync_fetch_and_sub(dest,val); }
KOKKOS_INLINE_FUNCTION
unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
{ return __sync_fetch_and_sub(dest,val); }
#endif
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
union { int i ; T t ; } assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_sub( volatile T * const dest ,
typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(long) , const T >::type val )
{
union { long i ; T t ; } assume , oldval , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = assume.t - val ;
oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
//----------------------------------------------------------------------------
template < typename T >
inline
T atomic_fetch_sub( volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
, const T >::type& val )
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
*dest = return_val - val;
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
//----------------------------------------------------------------------------
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
template< typename T >
T atomic_fetch_sub( volatile T * const dest , const T val )
{
T retval;
#pragma omp atomic capture
{
retval = dest[0];
dest[0] -= val;
}
return retval;
}
#endif
// Simpler version of atomic_fetch_sub without the fetch
template <typename T>
KOKKOS_INLINE_FUNCTION
void atomic_sub(volatile T * const dest, const T src) {
atomic_fetch_sub(dest,src);
}
}
#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
#endif
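A minimal sketch of the subtraction entry points (values illustrative):
// Hypothetical usage sketch.
long remaining = 100;
long before = Kokkos::atomic_fetch_sub( &remaining , 1L ); // before == 100, remaining == 99
Kokkos::atomic_sub( &remaining , 9L );                     // remaining == 90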


@ -0,0 +1,375 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
#define KOKKOS_ATOMIC_GENERIC_HPP
#include <Kokkos_Macros.hpp>
// Combination operators to be used in a compare-and-exchange based atomic operation
namespace Kokkos {
namespace Impl {
template<class Scalar1, class Scalar2>
struct AddOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1+val2;
}
};
template<class Scalar1, class Scalar2>
struct SubOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1-val2;
}
};
template<class Scalar1, class Scalar2>
struct MulOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1*val2;
}
};
template<class Scalar1, class Scalar2>
struct DivOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1/val2;
}
};
template<class Scalar1, class Scalar2>
struct ModOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1%val2;
}
};
template<class Scalar1, class Scalar2>
struct AndOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1&val2;
}
};
template<class Scalar1, class Scalar2>
struct OrOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1|val2;
}
};
template<class Scalar1, class Scalar2>
struct XorOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1^val2;
}
};
template<class Scalar1, class Scalar2>
struct LShiftOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1<<val2;
}
};
template<class Scalar1, class Scalar2>
struct RShiftOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1>>val2;
}
};
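// A hedged illustration (MaxOper is hypothetical, not part of this commit):
// any functor exposing the same static apply() signature slots directly into
// the compare-and-exchange loops below, since these operator structs are the
// only customization point of this header.
template<class Scalar1, class Scalar2>
struct MaxOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
return val1 > val2 ? val1 : val2;
}
};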
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
union { unsigned long long int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
{
union { unsigned long long int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return newval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
{
union { int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return oldval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val )
{
union { int i ; T t ; } oldval , assume , newval ;
oldval.t = *dest ;
do {
assume.i = oldval.i ;
newval.t = Oper::apply(assume.t, val) ;
oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
} while ( assume.i != oldval.i );
return newval.t ;
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
&& ( sizeof(T) != 16 )
#endif
, const T >::type val )
{
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
*dest = Oper::apply(return_val, val);
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
// This is a way to (hopefully) avoid dead lock in a warp
T return_val;
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = *dest;
*dest = Oper::apply(return_val, val);
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
#endif
}
template < class Oper, typename T >
KOKKOS_INLINE_FUNCTION
T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
typename ::Kokkos::Impl::enable_if<
( sizeof(T) != 4 )
&& ( sizeof(T) != 8 )
#if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
&& ( sizeof(T) != 16 )
#endif
, const T >::type& val )
{
#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = Oper::apply(*dest, val);
*dest = return_val;
Impl::unlock_address_host_space( (void*) dest );
return return_val;
#else
// This is a way to (hopefully) avoid dead lock in a warp
T return_val;
bool done = false;
while (! done ) {
if( Impl::lock_address_cuda_space( (void*) dest ) ) {
return_val = Oper::apply(*dest, val);
*dest = return_val;
Impl::unlock_address_cuda_space( (void*) dest );
done = true;
}
}
return return_val;
#endif
}
}
}
namespace Kokkos {
// Fetch_Oper atomics: return value before operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mul(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::MulOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_div(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::DivOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mod(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::ModOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_and(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::AndOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_or(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::OrOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_xor(volatile T * const dest, const T val) {
return Impl::atomic_fetch_oper(Impl::XorOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) {
return Impl::atomic_fetch_oper(Impl::LShiftOper<T,const unsigned int>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
return Impl::atomic_fetch_oper(Impl::RShiftOper<T,const unsigned int>(),dest,val);
}
// Oper Fetch atomics: return value after operation
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mul_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::MulOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_div_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::DivOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mod_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::ModOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_and_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::AndOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_or_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::OrOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_xor_fetch(volatile T * const dest, const T val) {
return Impl::atomic_oper_fetch(Impl::XorOper<T,const T>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_lshift_fetch(volatile T * const dest, const unsigned int val) {
return Impl::atomic_oper_fetch(Impl::LShiftOper<T,const unsigned int>(),dest,val);
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
}
}
#endif
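Throughout this header the naming convention is consistent: atomic_fetch_* returns the pre-update value, atomic_*_fetch the post-update value. A minimal sketch (values illustrative):
// Hypothetical usage sketch.
double x = 3.0;
double before = Kokkos::atomic_fetch_mul( &x , 2.0 ); // before == 3.0, x == 6.0
double after  = Kokkos::atomic_div_fetch( &x , 3.0 ); // after == 2.0, x == 2.0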


@ -0,0 +1,462 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ATOMIC_VIEW_HPP
#define KOKKOS_ATOMIC_VIEW_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
namespace Kokkos { namespace Impl {
class AllocationTracker;
// The following tag is used to prevent an implicit constructor call when
// trying to assign a literal 0 ( = 0 );
struct AtomicViewConstTag {};
template<class ViewTraits>
class AtomicDataElement {
public:
typedef typename ViewTraits::value_type value_type;
typedef typename ViewTraits::const_value_type const_value_type;
typedef typename ViewTraits::non_const_value_type non_const_value_type;
volatile value_type* const ptr;
KOKKOS_INLINE_FUNCTION
AtomicDataElement(value_type* ptr_, AtomicViewConstTag ):ptr(ptr_){}
KOKKOS_INLINE_FUNCTION
const_value_type operator = (const_value_type& val) const {
*ptr = val;
return val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator = (volatile const_value_type& val) const {
*ptr = val;
return val;
}
KOKKOS_INLINE_FUNCTION
void inc() const {
Kokkos::atomic_increment(ptr);
}
KOKKOS_INLINE_FUNCTION
void dec() const {
Kokkos::atomic_decrement(ptr);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ++ () const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,1);
return tmp+1;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -- () const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-1);
return tmp-1;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ++ (int) const {
return Kokkos::atomic_fetch_add(ptr,1);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -- (int) const {
return Kokkos::atomic_fetch_add(ptr,-1);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator += (const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
return tmp+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator += (volatile const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
return tmp+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -= (const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
return tmp-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator -= (volatile const_value_type& val) const {
const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
return tmp-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator *= (const_value_type& val) const {
return Kokkos::atomic_mul_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator *= (volatile const_value_type& val) const {
return Kokkos::atomic_mul_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator /= (const_value_type& val) const {
return Kokkos::atomic_div_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator /= (volatile const_value_type& val) const {
return Kokkos::atomic_div_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator %= (const_value_type& val) const {
return Kokkos::atomic_mod_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator %= (volatile const_value_type& val) const {
return Kokkos::atomic_mod_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator &= (const_value_type& val) const {
return Kokkos::atomic_and_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator &= (volatile const_value_type& val) const {
return Kokkos::atomic_and_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^= (const_value_type& val) const {
return Kokkos::atomic_xor_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^= (volatile const_value_type& val) const {
return Kokkos::atomic_xor_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator |= (const_value_type& val) const {
return Kokkos::atomic_or_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator |= (volatile const_value_type& val) const {
return Kokkos::atomic_or_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator <<= (const_value_type& val) const {
return Kokkos::atomic_lshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator <<= (volatile const_value_type& val) const {
return Kokkos::atomic_lshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >>= (const_value_type& val) const {
return Kokkos::atomic_rshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >>= (volatile const_value_type& val) const {
return Kokkos::atomic_rshift_fetch(ptr,val);
}
KOKKOS_INLINE_FUNCTION
const_value_type operator + (const_value_type& val) const {
return *ptr+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator + (volatile const_value_type& val) const {
return *ptr+val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator - (const_value_type& val) const {
return *ptr-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator - (volatile const_value_type& val) const {
return *ptr-val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator * (const_value_type& val) const {
return *ptr*val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator * (volatile const_value_type& val) const {
return *ptr*val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator / (const_value_type& val) const {
return *ptr/val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator / (volatile const_value_type& val) const {
return *ptr/val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator % (const_value_type& val) const {
return *ptr%val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator % (volatile const_value_type& val) const {
return *ptr%val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ! () const {
return !*ptr;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator && (const_value_type& val) const {
return *ptr&&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator && (volatile const_value_type& val) const {
return *ptr&&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator || (const_value_type& val) const {
return *ptr||val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator || (volatile const_value_type& val) const {
return *ptr||val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator & (const_value_type& val) const {
return *ptr&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator & (volatile const_value_type& val) const {
return *ptr&val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator | (const_value_type& val) const {
return *ptr|val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator | (volatile const_value_type& val) const {
return *ptr|val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^ (const_value_type& val) const {
return *ptr^val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ^ (volatile const_value_type& val) const {
return *ptr^val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator ~ () const {
return ~*ptr;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator << (const unsigned int& val) const {
return *ptr<<val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator << (volatile const unsigned int& val) const {
return *ptr<<val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >> (const unsigned int& val) const {
return *ptr>>val;
}
KOKKOS_INLINE_FUNCTION
const_value_type operator >> (volatile const unsigned int& val) const {
return *ptr>>val;
}
KOKKOS_INLINE_FUNCTION
bool operator == (const_value_type& val) const {
return *ptr == val;
}
KOKKOS_INLINE_FUNCTION
bool operator == (volatile const_value_type& val) const {
return *ptr == val;
}
KOKKOS_INLINE_FUNCTION
bool operator != (const_value_type& val) const {
return *ptr != val;
}
KOKKOS_INLINE_FUNCTION
bool operator != (volatile const_value_type& val) const {
return *ptr != val;
}
KOKKOS_INLINE_FUNCTION
bool operator >= (const_value_type& val) const {
return *ptr >= val;
}
KOKKOS_INLINE_FUNCTION
bool operator >= (volatile const_value_type& val) const {
return *ptr >= val;
}
KOKKOS_INLINE_FUNCTION
bool operator <= (const_value_type& val) const {
return *ptr <= val;
}
KOKKOS_INLINE_FUNCTION
bool operator <= (volatile const_value_type& val) const {
return *ptr <= val;
}
KOKKOS_INLINE_FUNCTION
bool operator < (const_value_type& val) const {
return *ptr < val;
}
KOKKOS_INLINE_FUNCTION
bool operator < (volatile const_value_type& val) const {
return *ptr < val;
}
KOKKOS_INLINE_FUNCTION
bool operator > (const_value_type& val) const {
return *ptr > val;
}
KOKKOS_INLINE_FUNCTION
bool operator > (volatile const_value_type& val) const {
return *ptr > val;
}
KOKKOS_INLINE_FUNCTION
operator const_value_type () const {
//return Kokkos::atomic_load(ptr);
return *ptr;
}
KOKKOS_INLINE_FUNCTION
operator volatile non_const_value_type () volatile const {
//return Kokkos::atomic_load(ptr);
return *ptr;
}
};
template<class ViewTraits>
class AtomicViewDataHandle {
public:
typename ViewTraits::value_type* ptr;
KOKKOS_INLINE_FUNCTION
AtomicViewDataHandle()
: ptr(NULL)
{}
KOKKOS_INLINE_FUNCTION
AtomicViewDataHandle(typename ViewTraits::value_type* ptr_)
:ptr(ptr_)
{}
template<class iType>
KOKKOS_INLINE_FUNCTION
AtomicDataElement<ViewTraits> operator[] (const iType& i) const {
return AtomicDataElement<ViewTraits>(ptr+i,AtomicViewConstTag());
}
KOKKOS_INLINE_FUNCTION
operator typename ViewTraits::value_type * () const { return ptr ; }
};
template<unsigned Size>
struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars;
template<>
struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> {
typedef int type;
};
template<>
struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
typedef int64_t type;
};
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
template<class ViewTraits>
class ViewDataHandle<
ViewTraits ,
typename enable_if<
( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
( ViewTraits::memory_traits::Atomic )
>::type >
{
private:
// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
// (sizeof(typename ViewTraits::const_value_type)==8),
// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
// atomic_view_possible;
typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
typedef ViewDataHandle self_type;
public:
enum { ReturnTypeIsReference = false };
typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
typedef Impl::AtomicDataElement<ViewTraits> return_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
{
return handle_type(arg_data_ptr);
}
};
}} // namespace Kokkos::Impl
#endif
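A hedged sketch of what this data handle enables at the user level (view name and sizes are illustrative; assumes KOKKOS_HAVE_CXX11 so the lambda compiles):
// Hypothetical usage sketch. Every element access goes through
// AtomicDataElement, so the += below is safe under concurrent updates.
Kokkos::View<int*, Kokkos::MemoryTraits<Kokkos::Atomic> > hist("hist", 100);
Kokkos::parallel_for( 1000 , KOKKOS_LAMBDA( const int i ) {
hist( i % 100 ) += 1;
});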


@ -0,0 +1,211 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
#define KOKKOS_ATOMIC_WINDOWS_HPP
#ifdef _WIN32
#define NOMINMAX
#include <Windows.h>
namespace Kokkos {
namespace Impl {
__declspec(align(16))
struct cas128_t
{
LONGLONG lower;
LONGLONG upper;
KOKKOS_INLINE_FUNCTION
bool operator != (const cas128_t& a) const {
return (lower != a.lower) || (upper != a.upper);
}
};
}
#ifdef KOKKOS_HAVE_CXX11
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONG), const T & >::type val)
{
union U {
LONG i;
T t;
KOKKOS_INLINE_FUNCTION U() {};
} tmp;
tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val), *((LONG*)&compare));
return tmp.t;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONGLONG), const T & >::type val)
{
union U {
LONGLONG i;
T t;
KOKKOS_INLINE_FUNCTION U() {};
} tmp;
tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, *((LONGLONG*)&val), *((LONGLONG*)&compare));
return tmp.t;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T * const dest, const T & compare,
typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val)
{
union U {
Impl::cas128_t i;
T t;
KOKKOS_INLINE_FUNCTION U() {};
} tmp, newval;
newval.t = val;
tmp.t = compare;
// _InterlockedCompareExchange128 returns a success flag and writes the previous
// contents of dest into its final argument, so tmp holds the old value either way.
_InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, (LONGLONG*)&tmp.i);
return tmp.t;
}
template< typename T >
T atomic_fetch_or(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = val | oldval;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
return oldval;
}
template< typename T >
T atomic_fetch_and(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = val & oldval;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
return oldval;
}
template< typename T >
T atomic_fetch_add(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = val + oldval;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
return oldval;
}
template< typename T >
T atomic_fetch_exchange(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
oldval = atomic_compare_exchange(dest, assume, val);
} while (assume != oldval);
return oldval;
}
template< typename T >
void atomic_or(volatile T * const dest, const T val) {
atomic_fetch_or(dest, val);
}
template< typename T >
void atomic_and(volatile T * const dest, const T val) {
atomic_fetch_and(dest, val);
}
template< typename T >
void atomic_add(volatile T * const dest, const T val) {
atomic_fetch_add(dest, val);
}
template< typename T >
void atomic_exchange(volatile T * const dest, const T val) {
atomic_fetch_exchange(dest, val);
}
template< typename T >
void atomic_assign(volatile T * const dest, const T val) {
atomic_fetch_exchange(dest, val);
}
template< typename T >
void atomic_increment(volatile T * const dest) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = assume + 1;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
}
template< typename T >
void atomic_decrement(volatile T * const dest) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = assume - 1;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
}
}
#endif
#endif
#endif


@ -0,0 +1,281 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
void* MallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
if (size) {
ptr = malloc(size);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
if (ptr) {
free(ptr);
}
}
void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
void * ptr = realloc(old_ptr, new_size);
if (new_size > 0u && ptr == NULL) {
throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
}
return ptr;
}
/*--------------------------------------------------------------------------*/
namespace {
void * raw_aligned_allocate( size_t size, size_t alignment )
{
void * ptr = NULL;
if ( size ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
ptr = _mm_malloc( size , alignment );
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
posix_memalign( & ptr, alignment , size );
#else
// Over-allocate and round up to guarantee proper alignment.
size_t size_padded = size + alignment + sizeof(void *);
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
#endif
}
return ptr;
}
void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
{
if ( ptr ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
_mm_free( ptr );
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
free( ptr );
#else
// get the alloc'd pointer
void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
free( alloc_ptr );
#endif
}
}
}
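// Worked example of the fallback path in raw_aligned_allocate (addresses are
// illustrative): for alignment = 16 and sizeof(void*) = 8, malloc might return
// alloc_ptr = 0x1003; address = 0x1003 + 8 = 0x100b; rem = 0x100b % 16 = 11;
// offset = 16 - 11 = 5; ptr = 0x1010, which is 16-byte aligned. The original
// alloc_ptr is stored at ptr - 8 so that raw_aligned_deallocate can recover
// and free() the true allocation.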
void* AlignedAllocator::allocate( size_t size )
{
void * ptr = 0 ;
if ( size ) {
ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void AlignedAllocator::deallocate( void * ptr, size_t size )
{
raw_aligned_deallocate( ptr, size);
}
void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size < new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, old_size );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
// mmap flags for private anonymous memory allocation
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#else
#define NO_MMAP
#endif
// huge page tables
#if !defined( NO_MMAP )
#if defined( MAP_HUGETLB )
#define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
#elif defined( MMAP_FLAGS )
#define MMAP_FLAGS_HUGE MMAP_FLAGS
#endif
// threshold to use huge pages
#define MMAP_USE_HUGE_PAGES (1u << 27)
#endif
// read write access to private memory
#if !defined( NO_MMAP )
#define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
#endif
void* PageAlignedAllocator::allocate( size_t size )
{
void *ptr = NULL;
if (size) {
#if !defined( NO_MMAP )
if ( size < MMAP_USE_HUGE_PAGES ) {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
} else {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
}
if (ptr == MAP_FAILED) {
ptr = NULL;
}
#else
static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
ptr = raw_aligned_allocate( size, page_size);
#endif
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void PageAlignedAllocator::deallocate( void * ptr, size_t size )
{
#if !defined( NO_MMAP )
munmap(ptr, size);
#else
raw_aligned_deallocate(ptr, size);
#endif
}
void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = NULL;
#if defined( NO_MMAP ) || defined( __APPLE__ )
if (old_size != new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
deallocate( old_ptr, old_size );
}
else {
ptr = old_ptr;
}
#else
ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
if (ptr == MAP_FAILED) {
throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
}
#endif
return ptr;
}
}} // namespace Kokkos::Impl
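The fallback path in raw_aligned_allocate() over-allocates by sizeof(void*) plus the alignment, rounds the address up, and stores the original malloc() pointer in the word just below the returned address so raw_aligned_deallocate() can recover it. A self-contained sketch of that technique (names and sizes are illustrative, not part of the Kokkos API):

#include <stdint.h>
#include <stdlib.h>
#include <assert.h>

// Illustrative re-implementation of the record-the-original-pointer trick.
void * demo_aligned_alloc( size_t size , size_t alignment )
{
  // Over-allocate: payload + saved pointer + worst-case padding.
  void * raw = malloc( size + sizeof(void*) + alignment );
  if ( 0 == raw ) return 0 ;
  uintptr_t address = reinterpret_cast<uintptr_t>(raw) + sizeof(void*);
  address += ( alignment - ( address % alignment ) ) % alignment ; // round up
  void * user = reinterpret_cast<void*>(address);
  reinterpret_cast<void**>(user)[-1] = raw ; // stash the malloc'd pointer
  return user ;
}

void demo_aligned_free( void * user )
{
  if ( user ) { free( reinterpret_cast<void**>(user)[-1] ); }
}

int main()
{
  void * p = demo_aligned_alloc( 100 , 64 );
  assert( 0 == reinterpret_cast<uintptr_t>(p) % 64 );
  demo_aligned_free( p );
  return 0 ;
}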

View File

@ -0,0 +1,118 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
#define KOKKOS_BASIC_ALLOCATORS_HPP
namespace Kokkos { namespace Impl {
/// class UnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
class UnmanagedAllocator
{
public:
static const char * name() { return "Unmanaged Allocator"; }
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class MallocAllocator
class MallocAllocator
{
public:
static const char * name()
{
return "Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
/// class AlignedAllocator
/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
class AlignedAllocator
{
public:
static const char * name()
{
return "Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
/// class PageAlignedAllocator
/// memory aligned to PAGE_SIZE
class PageAlignedAllocator
{
public:
static const char * name()
{
return "Page Aligned Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t size);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_BASIC_ALLOCATORS_HPP
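All three concrete allocators above expose the same static allocate/deallocate/reallocate interface, so client code can select one as a template parameter. A hedged usage sketch (the DemoBuffer type is illustrative, and the include path assumes the usual Kokkos core layout):

#include <string.h>
#include <impl/Kokkos_BasicAllocators.hpp>

// Toy buffer parameterized on any allocator with the static interface above.
template< class Allocator >
struct DemoBuffer {
  void * data ;
  size_t size ;
  explicit DemoBuffer( size_t n ) : data( Allocator::allocate(n) ), size(n) {}
  ~DemoBuffer() { Allocator::deallocate( data , size ); }
  void grow( size_t n ) { data = Allocator::reallocate( data , size , n ); size = n ; }
};

int main()
{
  DemoBuffer< Kokkos::Impl::MallocAllocator > buf( 128 );
  memset( buf.data , 0 , buf.size );
  buf.grow( 256 ); // reallocation through the allocator's static interface
  return 0 ;
}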

View File

@ -0,0 +1,447 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <cctype>
#include <cstring>
#include <iostream>
#include <cstdlib>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
bool is_unsigned_int(const char* str)
{
const size_t len = strlen (str);
for (size_t i = 0; i < len; ++i) {
if (! isdigit (str[i])) {
return false;
}
}
return true;
}
void initialize_internal(const InitArguments& args)
{
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
const int num_threads = args.num_threads;
const int use_numa = args.num_numa;
#endif // defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
#if defined( KOKKOS_HAVE_CUDA )
const int use_gpu = args.device_id;
#endif // defined( KOKKOS_HAVE_CUDA )
#if defined( KOKKOS_HAVE_OPENMP )
if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
if(num_threads>0) {
if(use_numa>0) {
Kokkos::OpenMP::initialize(num_threads,use_numa);
}
else {
Kokkos::OpenMP::initialize(num_threads);
}
} else {
Kokkos::OpenMP::initialize();
}
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
}
else {
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
}
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
if(num_threads>0) {
if(use_numa>0) {
Kokkos::Threads::initialize(num_threads,use_numa);
}
else {
Kokkos::Threads::initialize(num_threads);
}
} else {
Kokkos::Threads::initialize();
}
//std::cout << "Kokkos::initialize() fyi: Pthread enabled and initialized" << std::endl ;
}
else {
//std::cout << "Kokkos::initialize() fyi: Pthread enabled but not initialized" << std::endl ;
}
#endif
#if defined( KOKKOS_HAVE_SERIAL )
// Prevent "unused variable" warning for 'args' input struct. If
// Serial::initialize() ever needs to take arguments from the input
// struct, you may remove this line of code.
(void) args;
if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::Serial::initialize();
}
#endif
#if defined( KOKKOS_HAVE_CUDA )
if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
if (use_gpu > -1) {
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( use_gpu ) );
}
else {
Kokkos::Cuda::initialize();
}
//std::cout << "Kokkos::initialize() fyi: Cuda enabled and initialized" << std::endl ;
}
#endif
#ifdef KOKKOSP_ENABLE_PROFILING
Kokkos::Experimental::initialize();
#endif
}
void finalize_internal( const bool all_spaces = false )
{
#if defined( KOKKOS_HAVE_CUDA )
if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
if(Kokkos::Cuda::is_initialized())
Kokkos::Cuda::finalize();
}
#endif
#if defined( KOKKOS_HAVE_OPENMP )
if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
all_spaces ) {
if(Kokkos::OpenMP::is_initialized())
Kokkos::OpenMP::finalize();
}
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
all_spaces ) {
if(Kokkos::Threads::is_initialized())
Kokkos::Threads::finalize();
}
#endif
#if defined( KOKKOS_HAVE_SERIAL )
if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ||
all_spaces ) {
if(Kokkos::Serial::is_initialized())
Kokkos::Serial::finalize();
}
#endif
#ifdef KOKKOSP_ENABLE_PROFILING
Kokkos::Experimental::finalize();
#endif
}
void fence_internal()
{
#if defined( KOKKOS_HAVE_CUDA )
if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ) {
Kokkos::Cuda::fence();
}
#endif
#if defined( KOKKOS_HAVE_OPENMP )
if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::OpenMP::fence();
}
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::Threads::fence();
}
#endif
#if defined( KOKKOS_HAVE_SERIAL )
if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::Serial::fence();
}
#endif
}
} // namespace
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
void initialize(int& narg, char* arg[])
{
int num_threads = -1;
int numa = -1;
int device = -1;
int kokkos_threads_found = 0;
int kokkos_numa_found = 0;
int kokkos_device_found = 0;
int kokkos_ndevices_found = 0;
int iarg = 0;
while (iarg < narg) {
if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
//Find the number of threads (expecting --threads=XX)
if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
num_threads = atoi(number);
//Remove the --kokkos-threads argument from the list but leave --threads
if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_threads_found=1;
narg--;
} else {
iarg++;
}
} else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
//Find the number of NUMA regions (expecting --numa=XX)
if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
numa = atoi(number);
//Remove the --kokkos-numa argument from the list but leave --numa
if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_numa_found=1;
narg--;
} else {
iarg++;
}
} else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
//Find the device id (expecting --device=XX)
if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
device = atoi(number);
//Remove the --kokkos-device argument from the list but leave --device
if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_device_found=1;
narg--;
} else {
iarg++;
}
} else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
//Find the number of devices (expecting --ndevices=XX[,XX])
if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
int ndevices=-1;
int skip_device = 9999;
char* num1 = strchr(arg[iarg],'=')+1;
char* num2 = strpbrk(num1,",");
int num1_len = num2==NULL?strlen(num1):num2-num1;
char* num1_only = new char[num1_len+1];
strncpy(num1_only,num1,num1_len);
num1_only[num1_len]=0;
if(!Impl::is_unsigned_int(num1_only) || (strlen(num1_only)==0)) {
Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
}
if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
ndevices = atoi(num1_only);
if( num2 != NULL ) {
if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) )
Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
skip_device = atoi(num2+1);
}
if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) {
char *str;
if ((str = getenv("SLURM_LOCALID"))) {
int local_rank = atoi(str);
device = local_rank % ndevices;
if (device >= skip_device) device++;
}
if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
int local_rank = atoi(str);
device = local_rank % ndevices;
if (device >= skip_device) device++;
}
if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
int local_rank = atoi(str);
device = local_rank % ndevices;
if (device >= skip_device) device++;
}
if(device==-1) {
device = 0;
if (device >= skip_device) device++;
}
}
//Remove the --kokkos-ndevices argument from the list but leave --ndevices
if(strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_ndevices_found=1;
narg--;
} else {
iarg++;
}
} else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
std::cout << "-------------Kokkos command line arguments--------------------------------------" << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
std::cout << "The following arguments exist also without prefix 'kokkos' (e.g. --help)." << std::endl;
std::cout << "The prefixed arguments will be removed from the list by Kokkos::initialize()," << std::endl;
std::cout << "the non-prefixed ones are not removed. Prefixed versions take precedence over " << std::endl;
std::cout << "non prefixed ones, and the last occurence of an argument overwrites prior" << std::endl;
std::cout << "settings." << std::endl;
std::cout << std::endl;
std::cout << "--kokkos-help : print this message" << std::endl;
std::cout << "--kokkos-threads=INT : specify total number of threads or" << std::endl;
std::cout << " number of threads per NUMA region if " << std::endl;
std::cout << " used in conjunction with '--numa' option. " << std::endl;
std::cout << "--kokkos-numa=INT : specify number of NUMA regions used by process." << std::endl;
std::cout << "--kokkos-device=INT : specify device id to be used by Kokkos. " << std::endl;
std::cout << "--kokkos-ndevices=INT[,INT] : used when running MPI jobs. Specify number of" << std::endl;
std::cout << " devices per node to be used. Process to device" << std::endl;
std::cout << " mapping happens by obtaining the local MPI rank" << std::endl;
std::cout << " and assigning devices round-robin. The optional" << std::endl;
std::cout << " second argument allows for an existing device" << std::endl;
std::cout << " to be ignored. This is most useful on workstations" << std::endl;
std::cout << " with multiple GPUs of which one is used to drive" << std::endl;
std::cout << " screen output." << std::endl;
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
std::cout << std::endl;
//Remove the --kokkos-help argument from the list but leave --help
if(strcmp(arg[iarg],"--kokkos-help") == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
narg--;
} else {
iarg++;
}
} else
iarg++;
}
InitArguments arguments;
arguments.num_threads = num_threads;
arguments.num_numa = numa;
arguments.device_id = device;
Impl::initialize_internal(arguments);
}
void initialize(const InitArguments& arguments) {
Impl::initialize_internal(arguments);
}
void finalize()
{
Impl::finalize_internal();
}
void finalize_all()
{
enum { all_spaces = true };
Impl::finalize_internal( all_spaces );
}
void fence()
{
Impl::fence_internal();
}
} // namespace Kokkos
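Typical entry-point usage of the parsing above: initialize() consumes the --kokkos-* arguments and leaves the unprefixed forms for the application. A minimal sketch:

#include <Kokkos_Core.hpp>

int main( int argc , char* argv[] )
{
  // e.g. invoked as: ./demo --kokkos-threads=4 --kokkos-ndevices=2
  Kokkos::initialize( argc , argv ); // strips the --kokkos-* arguments
  {
    // argc/argv now hold only the application's own arguments;
    // Kokkos views and kernels live inside this scope.
  }
  Kokkos::finalize();
  return 0 ;
}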

View File

@ -0,0 +1,193 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <ostream>
#include <sstream>
#include <iomanip>
#include <stdexcept>
#include <impl/Kokkos_Error.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void host_abort( const char * const message )
{
fwrite(message,1,strlen(message),stderr);
fflush(stderr);
abort();
}
void throw_runtime_exception( const std::string & msg )
{
std::ostringstream o ;
o << msg ;
traceback_callstack( o );
throw std::runtime_error( o.str() );
}
std::string human_memory_size(size_t arg_bytes)
{
double bytes = arg_bytes;
const double K = 1024;
const double M = K*1024;
const double G = M*1024;
std::ostringstream out;
if (bytes < K) {
out << std::setprecision(4) << bytes << " B";
} else if (bytes < M) {
bytes /= K;
out << std::setprecision(4) << bytes << " K";
} else if (bytes < G) {
bytes /= M;
out << std::setprecision(4) << bytes << " M";
} else {
bytes /= G;
out << std::setprecision(4) << bytes << " G";
}
return out.str();
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK )
/* This is only known to work with GNU C++
* Must be compiled with '-rdynamic'
* Must be linked with '-ldl'
*/
/* Print call stack into an error stream,
* so one knows in which function the error occurred.
*
* Code copied from:
* http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html
*
* License on this site:
* This blog is licensed under a
* Creative Commons Attribution-Share Alike 3.0 Unported License.
*
* http://creativecommons.org/licenses/by-sa/3.0/
*
* Modified to output to std::ostream.
*/
#include <signal.h>
#include <execinfo.h>
#include <cxxabi.h>
#include <dlfcn.h>
#include <stdlib.h>
namespace Kokkos {
namespace Impl {
void traceback_callstack( std::ostream & msg )
{
using namespace abi;
enum { MAX_DEPTH = 32 };
void *trace[MAX_DEPTH];
Dl_info dlinfo;
int status;
int trace_size = backtrace(trace, MAX_DEPTH);
msg << std::endl << "Call stack {" << std::endl ;
for (int i=1; i<trace_size; ++i)
{
if(!dladdr(trace[i], &dlinfo))
continue;
const char * symname = dlinfo.dli_sname;
char * demangled = __cxa_demangle(symname, NULL, 0, &status);
if ( status == 0 && demangled ) {
symname = demangled;
}
if ( symname && *symname != 0 ) {
msg << " object: " << dlinfo.dli_fname
<< " function: " << symname
<< std::endl ;
}
if ( demangled ) {
free(demangled);
}
}
msg << "}" ;
}
}
}
#else
namespace Kokkos {
namespace Impl {
void traceback_callstack( std::ostream & msg )
{
msg << std::endl << "Traceback functionality not available" << std::endl ;
}
}
}
#endif
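human_memory_size() above divides by successive powers of 1024 and prints four significant digits. A small illustration of the thresholds (expected outputs shown as comments; include path assumes the usual Kokkos core layout):

#include <stddef.h>
#include <iostream>
#include <impl/Kokkos_Error.hpp>

int main()
{
  using Kokkos::Impl::human_memory_size ;
  std::cout << human_memory_size( 512 )             << std::endl ; // "512 B"
  std::cout << human_memory_size( 1536 )            << std::endl ; // "1.5 K"
  std::cout << human_memory_size( size_t(3) << 20 ) << std::endl ; // "3 M"
  std::cout << human_memory_size( size_t(5) << 30 ) << std::endl ; // "5 G"
  return 0 ;
}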

View File

@ -0,0 +1,78 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ERROR_HPP
#define KOKKOS_IMPL_ERROR_HPP
#include <string>
#include <iosfwd>
namespace Kokkos {
namespace Impl {
void host_abort( const char * const );
void throw_runtime_exception( const std::string & );
void traceback_callstack( std::ostream & );
std::string human_memory_size(size_t arg_bytes);
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
namespace Kokkos {
inline
void abort( const char * const message ) { Kokkos::Impl::host_abort(message); }
}
#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
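throw_runtime_exception() raises a std::runtime_error whose message carries the call stack when tracebacks are available (GNU toolchain with ENABLE_TRACEBACK, compiled with -rdynamic and linked with -ldl), and a short "not available" note otherwise. A hedged sketch of catching it at application level:

#include <iostream>
#include <stdexcept>
#include <impl/Kokkos_Error.hpp>

int main()
{
  try {
    Kokkos::Impl::throw_runtime_exception( "demo failure" );
  }
  catch ( const std::runtime_error & e ) {
    // Message plus traceback (or the "not available" note) from above.
    std::cerr << e.what() << std::endl;
  }
  return 0 ;
}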

File diff suppressed because it is too large

View File

@ -0,0 +1,455 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
/*--------------------------------------------------------------------------*/
#if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_HAVE_CUDA )
// Intel specialized allocator does not interoperate with CUDA memory allocation
#define KOKKOS_INTEL_MM_ALLOC_AVAILABLE
#endif
/*--------------------------------------------------------------------------*/
#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
#define KOKKOS_POSIX_MEMALIGN_AVAILABLE
#include <unistd.h>
#include <sys/mman.h>
/* mmap flags for private anonymous memory allocation */
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON ) && defined( MAP_PRIVATE )
#define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#endif
// mmap flags for huge page tables
#if defined( KOKKOS_POSIX_MMAP_FLAGS )
#if defined( MAP_HUGETLB )
#define KOKKOS_POSIX_MMAP_FLAGS_HUGE (KOKKOS_POSIX_MMAP_FLAGS | MAP_HUGETLB )
#else
#define KOKKOS_POSIX_MMAP_FLAGS_HUGE KOKKOS_POSIX_MMAP_FLAGS
#endif
#endif
#endif
/*--------------------------------------------------------------------------*/
#include <stddef.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include <iostream>
#include <sstream>
#include <cstring>
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Kokkos_Atomic.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
DeepCopy<HostSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{
memcpy( dst , src , n );
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace {
static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
typedef int (* QuerySpaceInParallelPtr )();
QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
int s_in_parallel_query_count = 0 ;
} // namespace <empty>
void HostSpace::register_in_parallel( int (*device_in_parallel)() )
{
if ( 0 == device_in_parallel ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
}
int i = -1 ;
if ( ! (device_in_parallel)() ) {
for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
}
if ( i < s_in_parallel_query_count ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
}
if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
}
for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
if ( i == s_in_parallel_query_count ) {
s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
}
}
int HostSpace::in_parallel()
{
const int n = s_in_parallel_query_count ;
int i = 0 ;
while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
return i < n ;
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
Impl::AllocationTracker HostSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label );
}
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/* Default allocation mechanism */
HostSpace::HostSpace()
: m_alloc_mech(
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
HostSpace::INTEL_MM_ALLOC
#elif defined( KOKKOS_POSIX_MMAP_FLAGS )
HostSpace::POSIX_MMAP
#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
HostSpace::POSIX_MEMALIGN
#else
HostSpace::STD_MALLOC
#endif
)
{}
/* Default allocation mechanism */
HostSpace::HostSpace( const HostSpace::AllocationMechanism & arg_alloc_mech )
: m_alloc_mech( HostSpace::STD_MALLOC )
{
if ( arg_alloc_mech == STD_MALLOC ) {
m_alloc_mech = HostSpace::STD_MALLOC ;
}
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
else if ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) {
m_alloc_mech = HostSpace::INTEL_MM_ALLOC ;
}
#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
else if ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) {
m_alloc_mech = HostSpace::POSIX_MEMALIGN ;
}
#elif defined( KOKKOS_POSIX_MMAP_FLAGS )
else if ( arg_alloc_mech == HostSpace::POSIX_MMAP ) {
m_alloc_mech = HostSpace::POSIX_MMAP ;
}
#endif
else {
const char * const mech =
( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) ? "INTEL_MM_ALLOC" : (
( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) ? "POSIX_MEMALIGN" : (
( arg_alloc_mech == HostSpace::POSIX_MMAP ) ? "POSIX_MMAP" : "" ));
std::string msg ;
msg.append("Kokkos::HostSpace ");
msg.append(mech);
msg.append(" is not available" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void * HostSpace::allocate( const size_t arg_alloc_size ) const
{
static_assert( sizeof(void*) == sizeof(uintptr_t)
, "Error sizeof(void*) != sizeof(uintptr_t)" );
static_assert( Kokkos::Impl::power_of_two< Kokkos::Impl::MEMORY_ALIGNMENT >::value
, "Memory alignment must be power of two" );
constexpr size_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ;
constexpr size_t alignment_mask = alignment - 1 ;
void * ptr = NULL;
if ( arg_alloc_size ) {
if ( m_alloc_mech == STD_MALLOC ) {
// Over-allocate to and round up to guarantee proper alignment.
size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ;
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
}
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
ptr = _mm_malloc( arg_alloc_size , alignment );
}
#endif
#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
else if ( m_alloc_mech == POSIX_MEMALIGN ) {
posix_memalign( & ptr, alignment , arg_alloc_size );
}
#endif
#if defined( KOKKOS_POSIX_MMAP_FLAGS )
else if ( m_alloc_mech == POSIX_MMAP ) {
constexpr size_t use_huge_pages = (1u << 27);
constexpr int prot = PROT_READ | PROT_WRITE ;
const int flags = arg_alloc_size < use_huge_pages
? KOKKOS_POSIX_MMAP_FLAGS
: KOKKOS_POSIX_MMAP_FLAGS_HUGE ;
// read write access to private memory
ptr = mmap( NULL /* address hint, if NULL OS kernel chooses address */
, arg_alloc_size /* size in bytes */
, prot /* memory protection */
, flags /* visibility of updates */
, -1 /* file descriptor */
, 0 /* offset */
);
/* Associated reallocation:
ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE );
*/
}
#endif
}
if ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) {
Kokkos::Impl::throw_runtime_exception( "Kokkos::HostSpace aligned allocation failed" );
}
return ptr;
}
void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
{
if ( arg_alloc_ptr ) {
if ( m_alloc_mech == STD_MALLOC ) {
void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
free( alloc_ptr );
}
#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
_mm_free( arg_alloc_ptr );
}
#endif
#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
else if ( m_alloc_mech == POSIX_MEMALIGN ) {
free( arg_alloc_ptr );
}
#endif
#if defined( KOKKOS_POSIX_MMAP_FLAGS )
else if ( m_alloc_mech == POSIX_MMAP ) {
munmap( arg_alloc_ptr , arg_alloc_size );
}
#endif
}
}
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
namespace Impl {
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record ;
void
SharedAllocationRecord< Kokkos::HostSpace , void >::
deallocate( SharedAllocationRecord< void , void > * arg_rec )
{
delete static_cast<SharedAllocationRecord*>(arg_rec);
}
SharedAllocationRecord< Kokkos::HostSpace , void >::
~SharedAllocationRecord()
{
m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
, SharedAllocationRecord< void , void >::m_alloc_size
);
}
SharedAllocationRecord< Kokkos::HostSpace , void >::
SharedAllocationRecord( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const SharedAllocationRecord< void , void >::function_type arg_dealloc
)
// Pass through allocated [ SharedAllocationHeader , user_memory ]
// Pass through deallocation function
: SharedAllocationRecord< void , void >
( & SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record
, reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
, sizeof(SharedAllocationHeader) + arg_alloc_size
, arg_dealloc
)
, m_space( arg_space )
{
// Fill in the Header information
RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
strncpy( RecordBase::m_alloc_ptr->m_label
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
}
SharedAllocationRecord< Kokkos::HostSpace , void > *
SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr )
{
typedef SharedAllocationHeader Header ;
typedef SharedAllocationRecord< Kokkos::HostSpace , void > RecordHost ;
SharedAllocationHeader const * const head = Header::get_header( alloc_ptr );
RecordHost * const record = static_cast< RecordHost * >( head->m_record );
if ( record->m_alloc_ptr != head ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void >::get_record ERROR" ) );
}
return record ;
}
// Iterate records to print orphaned memory ...
void SharedAllocationRecord< Kokkos::HostSpace , void >::
print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace {
const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
}
namespace Impl {
void init_lock_array_host_space() {
static int is_initialized = 0;
if(! is_initialized) {
for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
HOST_SPACE_ATOMIC_LOCKS[i] = 0;
is_initialized = 1; // flag was never set before, so the table was re-cleared on every call
}
}
bool lock_address_host_space(void* ptr) {
return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
0 , 1);
}
void unlock_address_host_space(void* ptr) {
atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
0);
}
}
}
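lock_address_host_space() hashes a pointer into the 64K-entry lock table by discarding the two low address bits, masking to 16 bits, and XOR-scrambling the result; the XOR with a constant merely relocates indices within the table, while the mask bounds them. A standalone check of the index computation (constants copied from above):

#include <assert.h>
#include <stddef.h>

int main()
{
  const unsigned MASK     = 0xFFFF ; // HOST_SPACE_ATOMIC_MASK
  const unsigned XOR_MASK = 0x5A39 ; // HOST_SPACE_ATOMIC_XOR_MASK
  int value = 0 ;
  void * ptr = & value ;
  // Same index computation used by lock/unlock above.
  size_t index = ( ( size_t(ptr) >> 2 ) & MASK ) ^ XOR_MASK ;
  assert( index <= MASK ); // always lands inside HOST_SPACE_ATOMIC_LOCKS
  return 0 ;
}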

View File

@ -0,0 +1,73 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE )
#define KOKKOS_MEMORY_FENCE
namespace Kokkos {
//----------------------------------------------------------------------------
KOKKOS_FORCEINLINE_FUNCTION
void memory_fence()
{
#if defined( KOKKOS_ATOMICS_USE_CUDA )
__threadfence();
#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) )
__sync_synchronize();
#elif defined( KOKKOS_ATOMICS_USE_INTEL )
_mm_mfence();
#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
#pragma omp flush
#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
MemoryBarrier();
#else
#error "Error: memory_fence() not defined"
#endif
}
} // namespace Kokkos
#endif
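A typical use of memory_fence() is publishing data before raising a flag, so a thread that observes the flag also observes the data. A hedged sketch (plain volatile ints for brevity; production code would pair this with Kokkos atomic operations):

#include <Kokkos_Atomic.hpp> // defines KOKKOS_ATOMIC_HPP, enabling memory_fence()

volatile int g_data  = 0 ;
volatile int g_ready = 0 ;

void producer()
{
  g_data = 42 ;             // write the payload
  Kokkos::memory_fence();   // order the payload write before the flag write
  g_ready = 1 ;
}

int consumer()
{
  while ( ! g_ready ) { /* spin */ }
  Kokkos::memory_fence();   // order the flag read before the payload read
  return g_data ;           // observes 42 once g_ready is seen
}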

View File

@ -0,0 +1,84 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
#define KOKKOS_PHYSICAL_LAYOUT_HPP
#include <Kokkos_View.hpp>
namespace Kokkos {
namespace Impl {
struct PhysicalLayout {
enum LayoutType {Left,Right,Scalar,Error};
LayoutType layout_type;
int rank;
long long int stride[8]; //distance between two neighboring elements in a given dimension
template< class T , class L , class D , class M >
PhysicalLayout( const View<T,L,D,M,ViewDefault> & view )
: layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : (
is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
, rank( view.Rank )
{
for(int i=0;i<8;i++) stride[i] = 0;
view.stride( stride );
}
#ifdef KOKKOS_HAVE_CUDA
template< class T , class L , class D , class M >
PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view )
: layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft >::value ? Left : (
is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
, rank( view.Rank )
{
for(int i=0;i<8;i++) stride[i] = 0;
view.stride( stride );
}
#endif
};
}
}
#endif
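PhysicalLayout records a View's layout tag, rank, and per-dimension strides. A hedged usage sketch, assuming the legacy View interface this header targets (include path illustrative):

#include <Kokkos_Core.hpp>
#include <impl/Kokkos_PhysicalLayout.hpp>

int main( int argc , char* argv[] )
{
  Kokkos::initialize( argc , argv );
  {
    Kokkos::View< double** , Kokkos::LayoutLeft , Kokkos::HostSpace > a( "A" , 10 , 20 );
    Kokkos::Impl::PhysicalLayout layout( a );
    // layout.layout_type == Kokkos::Impl::PhysicalLayout::Left
    // layout.rank        == 2
    // layout.stride[i]   : distance between neighboring elements in dimension i
  }
  Kokkos::finalize();
  return 0 ;
}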

View File

@ -0,0 +1,57 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOSP_DEVICE_INFO_HPP
#define KOKKOSP_DEVICE_INFO_HPP
namespace Kokkos {
namespace Experimental {
struct KokkosPDeviceInfo {
uint32_t deviceID;
};
}
}
#endif

View File

@ -0,0 +1,141 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <impl/Kokkos_Profiling_Interface.hpp>
#ifdef KOKKOSP_ENABLE_PROFILING
#include <string.h>
namespace Kokkos {
namespace Experimental {
bool profileLibraryLoaded() {
return (NULL != initProfileLibrary);
}
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginForCallee) {
Kokkos::fence();
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
}
};
void endParallelFor(const uint64_t kernelID) {
if(NULL != endForCallee) {
Kokkos::fence();
(*endForCallee)(kernelID);
}
};
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginScanCallee) {
Kokkos::fence();
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
}
};
void endParallelScan(const uint64_t kernelID) {
if(NULL != endScanCallee) {
Kokkos::fence();
(*endScanCallee)(kernelID);
}
};
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginReduceCallee) {
Kokkos::fence();
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
}
};
void endParallelReduce(const uint64_t kernelID) {
if(NULL != endReduceCallee) {
Kokkos::fence();
(*endReduceCallee)(kernelID);
}
};
void initialize() {
void* firstProfileLibrary = NULL;
char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
// getenv() returns NULL when the variable is unset; passing that into
// strtok() would be undefined behavior, so return early in that case.
if(NULL == envProfileLibrary) {
return;
}
char* profileLibraryName = strtok(envProfileLibrary, ";");
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
if(NULL == firstProfileLibrary) {
std::cerr << "Error: Unable to load KokkosP library: " <<
profileLibraryName << std::endl;
} else {
std::cout << "KOKKOSP: Library Loaded: " << profileLibraryName << std::endl;
beginForCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
beginScanCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
beginReduceCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
endScanCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
endForCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
endReduceCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
initProfileLibrary = (initFunction) dlsym(firstProfileLibrary, "kokkosp_init_library");
finalizeProfileLibrary = (finalizeFunction) dlsym(firstProfileLibrary, "kokkosp_finalize_library");
}
}
if(NULL != initProfileLibrary) {
(*initProfileLibrary)(0,
(uint64_t) KOKKOSP_INTERFACE_VERSION,
(uint32_t) 0,
NULL);
}
};
void finalize() {
if(NULL != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
}
};
}
}
#endif
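initialize() above dlopen()s the library named by KOKKOS_PROFILE_LIBRARY and resolves the kokkosp_* entry points. A minimal tool implementing two of those hooks could look like this (hedged sketch; file and library names are illustrative, compile as a shared object and point the environment variable at it):

// demo_tool.cpp -- build e.g.: g++ -shared -fPIC demo_tool.cpp -o libdemo_tool.so
// then: export KOKKOS_PROFILE_LIBRARY=./libdemo_tool.so
#include <stdio.h>
#include <stdint.h>

extern "C" void kokkosp_begin_parallel_for( const char* name ,
                                            const uint32_t devID ,
                                            uint64_t* kernelID )
{
  static uint64_t next = 0 ;
  *kernelID = next++ ;   // hand back an id the runtime returns at end-of-kernel
  printf( "begin parallel_for %s on device %u (id %llu)\n" ,
          name , (unsigned) devID , (unsigned long long) *kernelID );
}

extern "C" void kokkosp_end_parallel_for( const uint64_t kernelID )
{
  printf( "end parallel_for id %llu\n" , (unsigned long long) kernelID );
}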

View File

@ -0,0 +1,98 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOSP_INTERFACE_HPP
#define KOKKOSP_INTERFACE_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#include <string>
#ifdef KOKKOSP_ENABLE_PROFILING
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
#include <dlfcn.h>
#include <iostream>
#include <stdlib.h>
#endif
#define KOKKOSP_INTERFACE_VERSION 20150628
#ifdef KOKKOSP_ENABLE_PROFILING
namespace Kokkos {
namespace Experimental {
typedef void (*initFunction)(const int,
const uint64_t,
const uint32_t,
KokkosPDeviceInfo*);
typedef void (*finalizeFunction)();
typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
typedef void (*endFunction)(uint64_t);
static initFunction initProfileLibrary = NULL;
static finalizeFunction finalizeProfileLibrary = NULL;
static beginFunction beginForCallee = NULL;
static beginFunction beginScanCallee = NULL;
static beginFunction beginReduceCallee = NULL;
static endFunction endForCallee = NULL;
static endFunction endScanCallee = NULL;
static endFunction endReduceCallee = NULL;
bool profileLibraryLoaded();
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelFor(const uint64_t kernelID);
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelScan(const uint64_t kernelID);
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelReduce(const uint64_t kernelID);
void initialize();
void finalize();
}
}
#endif
#endif

View File

@ -0,0 +1,119 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdlib.h>
#include <sstream>
#include <Kokkos_Serial.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace SerialImpl {
Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
Sentinel::~Sentinel()
{
if ( m_scratch ) { free( m_scratch ); }
m_scratch = 0 ;
m_reduce_end = 0 ;
m_shared_end = 0 ;
}
Sentinel & Sentinel::singleton()
{
static Sentinel s ; return s ;
}
inline
unsigned align( unsigned n )
{
enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
return ( n + MASK ) & ~MASK ;
}
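// Example: align() rounds up to the next 256-byte boundary, so
// align(1) == 256 , align(256) == 256 , and align(257) == 512.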
} // namespace SerialImpl
SerialTeamMember::SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
)
: m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
, arg_shared_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
} // namespace Impl
void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
{
static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
reduce_size = Impl::SerialImpl::align( reduce_size );
shared_size = Impl::SerialImpl::align( shared_size );
if ( ( s.m_reduce_end < reduce_size ) ||
( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
if ( s.m_scratch ) { free( s.m_scratch ); }
if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
s.m_scratch = malloc( s.m_shared_end );
}
return s.m_scratch ;
}
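// Example: a first call with reduce_size = 100 and shared_size = 200 aligns
// both requests to 256 bytes, allocates a single 512-byte block, and lays
// out the reduce area in [0,256) and the shared area in [256,512).
// Subsequent calls that fit within these extents reuse the allocation.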
} // namespace Kokkos
#endif // defined( KOKKOS_HAVE_SERIAL )

View File

@@ -0,0 +1,336 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#include <impl/Kokkos_Serial_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
#include <stdlib.h>
#include <stdexcept>
#include <iostream>
#include <sstream>
#include <string>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Serial >::member_type &
TaskPolicy< Kokkos::Serial >::member_single()
{
static member_type s(0,1,0);
return s ;
}
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Serial , void , void > Task ;
//----------------------------------------------------------------------------
namespace {
inline
unsigned padded_sizeof_derived( unsigned sizeof_derived )
{
return sizeof_derived +
( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 );
}
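// Example: with 8-byte pointers, padded_sizeof_derived(44) == 48 and
// padded_sizeof_derived(48) == 48, so the Task*[ dependence_capacity ]
// array appended by allocate() below is always pointer-aligned.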
} // namespace
void Task::deallocate( void * ptr )
{
free( ptr );
}
void * Task::allocate( const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity )
{
return malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
}
Task::~TaskMember()
{
}
Task::TaskMember( const Task::function_verify_type arg_verify
, const Task::function_dealloc_type arg_dealloc
, const Task::function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( arg_verify )
, m_apply( arg_apply )
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_wait( 0 )
, m_next( 0 )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( TASK_STATE_CONSTRUCTING )
{
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
Task::TaskMember( const Task::function_dealloc_type arg_dealloc
, const Task::function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: m_dealloc( arg_dealloc )
, m_verify( & Task::verify_type<void> )
, m_apply( arg_apply )
, m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
, m_wait( 0 )
, m_next( 0 )
, m_dep_capacity( arg_dependence_capacity )
, m_dep_size( 0 )
, m_ref_count( 0 )
, m_state( TASK_STATE_CONSTRUCTING )
{
for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
}
//----------------------------------------------------------------------------
void Task::throw_error_add_dependence() const
{
std::cerr << "TaskMember< Serial >::add_dependence ERROR"
<< " state(" << m_state << ")"
<< " dep_size(" << m_dep_size << ")"
<< std::endl ;
throw std::runtime_error("TaskMember< Serial >::add_dependence ERROR");
}
void Task::throw_error_verify_type()
{
throw std::runtime_error("TaskMember< Serial >::verify_type ERROR");
}
//----------------------------------------------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
{
static const char msg_error_header[] = "Kokkos::Experimental::Impl::TaskManager<Kokkos::Serial>::assign ERROR" ;
static const char msg_error_count[] = ": negative reference count" ;
static const char msg_error_complete[] = ": destroy task that is not complete" ;
static const char msg_error_dependences[] = ": destroy task that has dependences" ;
static const char msg_error_exception[] = ": caught internal exception" ;
const char * msg_error = 0 ;
try {
if ( *lhs ) {
const int count = --((**lhs).m_ref_count);
if ( 0 == count ) {
// Reference count at zero, delete it
// Should only be deallocating a completed task
if ( (**lhs).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
// A completed task should not have dependences...
for ( int i = 0 ; i < (**lhs).m_dep_size && 0 == msg_error ; ++i ) {
if ( (**lhs).m_dep[i] ) msg_error = msg_error_dependences ;
}
}
else {
msg_error = msg_error_complete ;
}
if ( 0 == msg_error ) {
// Get deletion function and apply it
const Task::function_dealloc_type d = (**lhs).m_dealloc ;
(*d)( *lhs );
}
}
else if ( count <= 0 ) {
msg_error = msg_error_count ;
}
}
if ( 0 == msg_error && rhs ) { ++( rhs->m_ref_count ); }
*lhs = rhs ;
}
catch( ... ) {
if ( 0 == msg_error ) msg_error = msg_error_exception ;
}
if ( 0 != msg_error ) {
if ( no_throw ) {
std::cerr << msg_error_header << msg_error << std::endl ;
std::cerr.flush();
}
else {
std::string msg(msg_error_header);
msg.append(msg_error);
throw std::runtime_error( msg );
}
}
}
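// Usage sketch (illustrative): assign() implements intrusive reference
// counting on Task pointers.
//
//   Task * slot = 0 ;
//   assign( & slot , t );  // slot == t and t->m_ref_count is incremented
//   assign( & slot , 0 );  // count decremented; a complete task whose
//                          // count reaches zero is deallocated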
#endif
namespace {
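// s_ready  : head of the singly linked list of tasks ready to execute.
// s_denied : sentinel value stored in a completed task's m_wait so that
//            no further task can insert itself into that wait list.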
Task * s_ready = 0 ;
Task * s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) );
}
void Task::schedule()
{
// Execute ready tasks in case the task being scheduled
// is dependent upon a waiting and ready task.
Task::execute_ready_tasks();
// spawning : Constructing -> Waiting
// respawning : Executing -> Waiting
// updating : Waiting -> Waiting
// Must not be in a dependence linked list: 0 == m_next
const bool ok_state = TASK_STATE_COMPLETE != m_state ;
const bool ok_list = 0 == m_next ;
if ( ok_state && ok_list ) {
// Will be waiting for execution upon return from this function
m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
// Insert this task into another dependence that is not complete
int i = 0 ;
for ( ; i < m_dep_size ; ++i ) {
Task * const y = m_dep[i] ;
if ( y && s_denied != ( m_next = y->m_wait ) ) {
y->m_wait = this ; // CAS( & y->m_wait , m_next , this );
break ;
}
}
if ( i == m_dep_size ) {
// All dependences are complete, insert into the ready list
m_next = s_ready ;
s_ready = this ; // CAS( & s_ready , m_next = s_ready , this );
}
}
else {
throw std::runtime_error(std::string("Kokkos::Experimental::Impl::Task spawn or respawn state error"));
}
}
void Task::execute_ready_tasks()
{
while ( s_ready ) {
// Remove this task from the ready list
// Task * task ;
// while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) );
Task * const task = s_ready ;
s_ready = task->m_next ;
task->m_next = 0 ;
// precondition: task->m_state = TASK_STATE_WAITING
// precondition: task->m_dep[i]->m_state == TASK_STATE_COMPLETE for all i
// precondition: does not exist T such that T->m_wait = task
// precondition: does not exist T such that T->m_next = task
task->m_state = Kokkos::Experimental::TASK_STATE_EXECUTING ;
(*task->m_apply)( task );
if ( task->m_state == Kokkos::Experimental::TASK_STATE_EXECUTING ) {
// task did not respawn itself
task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
// release dependences:
for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
assign( task->m_dep + i , 0 );
}
// Stop other tasks from adding themselves to 'task->m_wait'
Task * x ;
// CAS( & task->m_wait , x = task->m_wait , s_denied );
x = task->m_wait ; task->m_wait = s_denied ;
// update tasks waiting on this task
while ( x ) {
Task * const next = x->m_next ;
x->m_next = 0 ;
x->schedule(); // could happen concurrently
x = next ;
}
}
}
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
#endif // defined( KOKKOS_HAVE_SERIAL )

View File

@@ -0,0 +1,845 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP
#define KOKKOS_SERIAL_TASKPOLICY_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
#include <string>
#include <typeinfo>
#include <stdexcept>
#include <Kokkos_Serial.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_View.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
/* Inheritance structure to allow static_cast between the task root type
 * and a task's FunctorType.
*
* task_root_type == TaskMember< Space , void , void >
*
* TaskMember< PolicyType , ResultType , FunctorType >
* : TaskMember< PolicyType::Space , ResultType , FunctorType >
* { ... };
*
* TaskMember< Space , ResultType , FunctorType >
* : TaskMember< Space , ResultType , void >
* , FunctorType
* { ... };
*
* when ResultType != void
*
* TaskMember< Space , ResultType , void >
* : TaskMember< Space , void , void >
* { ... };
*
*/
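// Concrete example (illustrative; 'MyFunctor' is a hypothetical user
// functor with 'typedef double value_type'). A single-thread task is
// allocated as
//
//   TaskMember< Kokkos::Serial , double , MyFunctor >
//     : TaskMember< Kokkos::Serial , double , void >  // holds m_result
//     , MyFunctor                                     // holds user state
//
// so a Future< double , Kokkos::Serial > can static_cast the root task
// pointer to read m_result, and apply_single() can cast the same pointer
// to invoke MyFunctor.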
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
/** \brief Base class for all tasks in the Serial execution space */
template<>
class TaskMember< Kokkos::Serial , void , void >
{
public:
typedef void (* function_apply_type) ( TaskMember * );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * );
private:
const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification
const function_apply_type m_apply ; ///< Apply function
TaskMember ** const m_dep ; ///< Dependences
TaskMember * m_wait ; ///< Linked list of tasks waiting on this task
TaskMember * m_next ; ///< Linked list of tasks waiting on a different task
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
int m_ref_count ; ///< Reference count
int m_state ; ///< State of the task
// size = 6 Pointers + 4 ints
TaskMember() /* = delete */ ;
TaskMember( const TaskMember & ) /* = delete */ ;
TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
static void deallocate( void * );
void throw_error_add_dependence() const ;
static void throw_error_verify_type();
template < class DerivedTaskType >
static
void deallocate( TaskMember * t )
{
DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
ptr->~DerivedTaskType();
deallocate( (void *) ptr );
}
protected :
~TaskMember();
// Used by TaskMember< Serial , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Serial , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
);
public:
template< typename ResultType >
KOKKOS_FUNCTION static
TaskMember * verify_type( TaskMember * t )
{
enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
if ( check_type && t != 0 ) {
// Verify that t->m_verify is this function
const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
if ( t->m_verify != self ) {
t = 0 ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
throw_error_verify_type();
#endif
}
}
return t ;
}
//----------------------------------------
/* Inheritance requirements on task types:
* typedef FunctorType::value_type value_type ;
* class DerivedTaskType
* : public TaskMember< Serial , value_type , FunctorType >
* { ... };
* class TaskMember< Serial , value_type , FunctorType >
* : public TaskMember< Serial , value_type , void >
* , public Functor
* { ... };
* If value_type != void
* class TaskMember< Serial , value_type , void >
* : public TaskMember< Serial , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
*/
/** \brief Allocate and construct a single-thread task */
template< class DerivedTaskType >
static
TaskMember * create( const typename DerivedTaskType::functor_type & arg_functor
, const unsigned arg_dependence_capacity
)
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, & TaskMember::template apply_single< functor_type , value_type >
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
/** \brief Allocate and construct a data parallel task */
template< class DerivedTaskType >
static
TaskMember * create( const typename DerivedTaskType::policy_type & arg_policy
, const typename DerivedTaskType::functor_type & arg_functor
, const unsigned arg_dependence_capacity
)
{
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_policy
, arg_functor
);
return static_cast< TaskMember * >( task );
}
/** \brief Allocate and construct a thread-team task */
template< class DerivedTaskType >
static
TaskMember * create_team( const typename DerivedTaskType::functor_type & arg_functor
, const unsigned arg_dependence_capacity
)
{
typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ;
DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
, & TaskMember::template apply_team< functor_type , value_type >
, sizeof(DerivedTaskType)
, arg_dependence_capacity
, arg_functor );
return static_cast< TaskMember * >( task );
}
void schedule();
static void execute_ready_tasks();
//----------------------------------------
typedef FutureValueTypeIsVoidError get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return get_result_type() ; }
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
//----------------------------------------
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
#else
KOKKOS_INLINE_FUNCTION static
void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
#endif
KOKKOS_INLINE_FUNCTION
TaskMember * get_dependence( int i ) const
{ return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
KOKKOS_INLINE_FUNCTION
int get_dependence() const
{ return m_dep_size ; }
KOKKOS_INLINE_FUNCTION
void clear_dependence()
{
for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
m_dep_size = 0 ;
}
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskMember * before )
{
if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
Kokkos::Experimental::TASK_STATE_EXECUTING == m_state ) &&
m_dep_size < m_dep_capacity ) {
assign( m_dep + m_dep_size , before );
++m_dep_size ;
}
else {
throw_error_add_dependence();
}
}
//----------------------------------------
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
}
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
}
//----------------------------------------
template< class FunctorType , class ResultType >
static
void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
typedef Kokkos::Impl::SerialTeamMember member_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member_type(0,1,0) , m.m_result );
}
template< class FunctorType , class ResultType >
static
void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
typedef Kokkos::Impl::SerialTeamMember member_type ;
// TaskMember< Kokkos::Serial , ResultType , FunctorType >
// : public TaskMember< Kokkos::Serial , ResultType , void >
// , public FunctorType
// { ... };
derived_type & m = * static_cast< derived_type * >( t );
m.FunctorType::apply( member_type(0,1,0) );
}
};
//----------------------------------------------------------------------------
/** \brief Base class for tasks with a result value in the Serial execution space.
*
 * The FunctorType template parameter is void because this class is
 * accessed by the Future class, which knows the result type but not
 * the functor type.
*
* Must be derived from TaskMember<S,void,void> 'root class' so the Future class
* can correctly static_cast from the 'root class' to this class.
*/
template < class ResultType >
class TaskMember< Kokkos::Serial , ResultType , void >
: public TaskMember< Kokkos::Serial , void , void >
{
public:
ResultType m_result ;
typedef const ResultType & get_result_type ;
KOKKOS_INLINE_FUNCTION
get_result_type get() const { return m_result ; }
protected:
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_type function_apply_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
)
: task_root_type( & task_root_type::template verify_type< ResultType >
, arg_dealloc
, arg_apply
, arg_sizeof_derived
, arg_dependence_capacity )
, m_result()
{}
};
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Serial , ResultType , FunctorType >
: public TaskMember< Kokkos::Serial , ResultType , void >
, public FunctorType
{
public:
typedef FunctorType functor_type ;
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef TaskMember< Kokkos::Serial , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_type function_apply_type ;
inline
TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_type arg_apply
, const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc , arg_apply , arg_sizeof_derived , arg_dependence_capacity )
, functor_type( arg_functor )
{}
};
//----------------------------------------------------------------------------
/** \brief ForEach task in the Serial execution space
*
* Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
* so that Functor can be cast to task root type without knowing policy.
*/
template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
class TaskForEach< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
, ResultType
, FunctorType >
: TaskMember< Kokkos::Serial , ResultType , FunctorType >
{
public:
typedef FunctorType functor_type ;
typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > policy_type ;
private:
friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ;
friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ;
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
policy_type m_policy ;
template< class Tag >
inline
typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value >::type
apply_policy() const
{
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()(i);
}
}
template< class Tag >
inline
typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same<Tag,void>::value >::type
apply_policy() const
{
const Tag tag ;
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()(tag,i);
}
}
static
void apply_parallel( task_root_type * t )
{
static_cast<TaskForEach*>(t)->template apply_policy< typename policy_type::work_tag >();
task_root_type::template apply_single< functor_type , ResultType >( t );
}
TaskForEach( const function_dealloc_type arg_dealloc
, const int arg_sizeof_derived
, const int arg_dependence_capacity
, const policy_type & arg_policy
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, & apply_parallel
, arg_sizeof_derived
, arg_dependence_capacity
, arg_functor )
, m_policy( arg_policy )
{}
TaskForEach() /* = delete */ ;
TaskForEach( const TaskForEach & ) /* = delete */ ;
TaskForEach & operator = ( const TaskForEach & ) /* = delete */ ;
};
//----------------------------------------------------------------------------
/** \brief Reduce task in the Serial execution space
*
* Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
* so that Functor can be cast to task root type without knowing policy.
*/
template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
class TaskReduce< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
, ResultType
, FunctorType >
: TaskMember< Kokkos::Serial , ResultType , FunctorType >
{
public:
typedef FunctorType functor_type ;
typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > policy_type ;
private:
friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ;
friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ;
typedef TaskMember< Kokkos::Serial , void , void > task_root_type ;
typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
policy_type m_policy ;
template< class Tag >
inline
void apply_policy( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
{
Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()( i, result );
}
}
template< class Tag >
inline
void apply_policy( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
{
Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
const Tag tag ;
const typename policy_type::member_type e = m_policy.end();
for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
functor_type::operator()( tag, i, result );
}
}
static
void apply_parallel( task_root_type * t )
{
TaskReduce * const task = static_cast<TaskReduce*>(t);
task->template apply_policy< typename policy_type::work_tag >( task->task_base_type::m_result );
task_root_type::template apply_single< functor_type , ResultType >( t );
}
TaskReduce( const function_dealloc_type arg_dealloc
, const int arg_sizeof_derived
, const int arg_dependence_capacity
, const policy_type & arg_policy
, const functor_type & arg_functor
)
: task_base_type( arg_dealloc
, & apply_parallel
, arg_sizeof_derived
, arg_dependence_capacity
, arg_functor )
, m_policy( arg_policy )
{}
TaskReduce() /* = delete */ ;
TaskReduce( const TaskReduce & ) /* = delete */ ;
TaskReduce & operator = ( const TaskReduce & ) /* = delete */ ;
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
template<>
class TaskPolicy< Kokkos::Serial >
{
public:
typedef Kokkos::Serial execution_space ;
typedef Kokkos::Impl::SerialTeamMember member_type ;
private:
typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
template< class FunctorType >
static inline
const task_root_type * get_task_root( const FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
}
template< class FunctorType >
static inline
task_root_type * get_task_root( FunctorType * f )
{
typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
return static_cast< task_root_type * >( static_cast< task_type * >(f) );
}
unsigned m_default_dependence_capacity ;
public:
KOKKOS_INLINE_FUNCTION
TaskPolicy() : m_default_dependence_capacity(4) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs ) : m_default_dependence_capacity( rhs.m_default_dependence_capacity ) {}
KOKKOS_INLINE_FUNCTION
explicit
TaskPolicy( const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity ) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy &
, const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity ) {}
TaskPolicy & operator = ( const TaskPolicy &rhs )
{
m_default_dependence_capacity = rhs.m_default_dependence_capacity;
return *this;
}
//----------------------------------------
template< class ValueType >
KOKKOS_INLINE_FUNCTION
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
f.m_task->schedule();
#endif
return f ;
}
// Create single-thread task
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type >(
functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_team( const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskMember< execution_space , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create_team< task_type >(
functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
// Create parallel foreach task
template< class PolicyType , class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_foreach( const PolicyType & policy
, const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskForEach< PolicyType , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type >( policy , functor ,
( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
// Create parallel reduce task
template< class PolicyType , class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< typename FunctorType::value_type , execution_space >
create_reduce( const PolicyType & policy
, const FunctorType & functor
, const unsigned dependence_capacity = ~0u ) const
{
typedef typename FunctorType::value_type value_type ;
typedef Impl::TaskReduce< PolicyType , value_type , FunctorType > task_type ;
return Future< value_type , execution_space >(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
task_root_type::create< task_type >( policy , functor ,
( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
#endif
);
}
// Add dependence
template< class A1 , class A2 , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( const Future<A1,A2> & after
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
&&
Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
after.m_task->add_dependence( before.m_task );
#endif
}
//----------------------------------------
// Functions for an executing task functor to query dependences,
// set new dependences, and respawn itself.
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
Future< void , execution_space >
get_dependence( const FunctorType * task_functor , int i ) const
{
return Future<void,execution_space>(
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
get_task_root(task_functor)->get_dependence(i)
#endif
);
}
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
int get_dependence( const FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return get_task_root(task_functor)->get_dependence(); }
#else
{ return 0 ; }
#endif
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void clear_dependence( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->clear_dependence(); }
#else
{}
#endif
template< class FunctorType , class A3 , class A4 >
KOKKOS_INLINE_FUNCTION
void add_dependence( FunctorType * task_functor
, const Future<A3,A4> & before
, typename Kokkos::Impl::enable_if
< Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
>::type * = 0
) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->add_dependence( before.m_task ); }
#else
{}
#endif
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void respawn( FunctorType * task_functor ) const
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ get_task_root(task_functor)->schedule(); }
#else
{}
#endif
//----------------------------------------
static member_type & member_single();
};
inline
void wait( TaskPolicy< Kokkos::Serial > & )
{ Impl::TaskMember< Kokkos::Serial , void , void >::execute_ready_tasks(); }
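// Usage sketch (illustrative; 'MyFunctor' is a hypothetical functor that
// defines 'value_type' and the apply() interface expected above):
//
//   Kokkos::Experimental::TaskPolicy< Kokkos::Serial > policy ;
//   Kokkos::Experimental::Future< double , Kokkos::Serial > f =
//     policy.spawn( policy.create( MyFunctor() ) );
//   Kokkos::Experimental::wait( policy );  // executes all ready tasks
//   double result = f.get();               // read the task's result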
} /* namespace Experimental */
} // namespace Kokkos
//----------------------------------------------------------------------------
#endif /* defined( KOKKOS_HAVE_SERIAL ) */
#endif /* #define KOKKOS_SERIAL_TASKPOLICY_HPP */

View File

@@ -0,0 +1,178 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <sstream>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Shape.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void assert_counts_are_equal_throw(
const size_t x_count ,
const size_t y_count )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::assert_counts_are_equal_throw( "
<< x_count << " != " << y_count << " )" ;
throw_runtime_exception( msg.str() );
}
void assert_shapes_are_equal_throw(
const unsigned x_scalar_size ,
const unsigned x_rank ,
const size_t x_N0 , const unsigned x_N1 ,
const unsigned x_N2 , const unsigned x_N3 ,
const unsigned x_N4 , const unsigned x_N5 ,
const unsigned x_N6 , const unsigned x_N7 ,
const unsigned y_scalar_size ,
const unsigned y_rank ,
const size_t y_N0 , const unsigned y_N1 ,
const unsigned y_N2 , const unsigned y_N3 ,
const unsigned y_N4 , const unsigned y_N5 ,
const unsigned y_N6 , const unsigned y_N7 )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::assert_shape_are_equal_throw( {"
<< " scalar_size(" << x_scalar_size
<< ") rank(" << x_rank
<< ") dimension(" ;
if ( 0 < x_rank ) { msg << " " << x_N0 ; }
if ( 1 < x_rank ) { msg << " " << x_N1 ; }
if ( 2 < x_rank ) { msg << " " << x_N2 ; }
if ( 3 < x_rank ) { msg << " " << x_N3 ; }
if ( 4 < x_rank ) { msg << " " << x_N4 ; }
if ( 5 < x_rank ) { msg << " " << x_N5 ; }
if ( 6 < x_rank ) { msg << " " << x_N6 ; }
if ( 7 < x_rank ) { msg << " " << x_N7 ; }
msg << " ) } != { "
<< " scalar_size(" << y_scalar_size
<< ") rank(" << y_rank
<< ") dimension(" ;
if ( 0 < y_rank ) { msg << " " << y_N0 ; }
if ( 1 < y_rank ) { msg << " " << y_N1 ; }
if ( 2 < y_rank ) { msg << " " << y_N2 ; }
if ( 3 < y_rank ) { msg << " " << y_N3 ; }
if ( 4 < y_rank ) { msg << " " << y_N4 ; }
if ( 5 < y_rank ) { msg << " " << y_N5 ; }
if ( 6 < y_rank ) { msg << " " << y_N6 ; }
if ( 7 < y_rank ) { msg << " " << y_N7 ; }
msg << " ) } )" ;
throw_runtime_exception( msg.str() );
}
void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply(
const size_t rank ,
const size_t n0 , const size_t n1 ,
const size_t n2 , const size_t n3 ,
const size_t n4 , const size_t n5 ,
const size_t n6 , const size_t n7 ,
const size_t arg_rank ,
const size_t i0 , const size_t i1 ,
const size_t i2 , const size_t i3 ,
const size_t i4 , const size_t i5 ,
const size_t i6 , const size_t i7 )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ;
if ( 0 < rank ) { msg << " " << n0 ; }
if ( 1 < rank ) { msg << " " << n1 ; }
if ( 2 < rank ) { msg << " " << n2 ; }
if ( 3 < rank ) { msg << " " << n3 ; }
if ( 4 < rank ) { msg << " " << n4 ; }
if ( 5 < rank ) { msg << " " << n5 ; }
if ( 6 < rank ) { msg << " " << n6 ; }
if ( 7 < rank ) { msg << " " << n7 ; }
msg << " } index = {" ;
if ( 0 < arg_rank ) { msg << " " << i0 ; }
if ( 1 < arg_rank ) { msg << " " << i1 ; }
if ( 2 < arg_rank ) { msg << " " << i2 ; }
if ( 3 < arg_rank ) { msg << " " << i3 ; }
if ( 4 < arg_rank ) { msg << " " << i4 ; }
if ( 5 < arg_rank ) { msg << " " << i5 ; }
if ( 6 < arg_rank ) { msg << " " << i6 ; }
if ( 7 < arg_rank ) { msg << " " << i7 ; }
msg << " } )" ;
throw_runtime_exception( msg.str() );
}
void assert_shape_effective_rank1_at_leastN_throw(
const size_t x_rank , const size_t x_N0 ,
const size_t x_N1 , const size_t x_N2 ,
const size_t x_N3 , const size_t x_N4 ,
const size_t x_N5 , const size_t x_N6 ,
const size_t x_N7 ,
const size_t N0 )
{
std::ostringstream msg ;
msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ;
if ( 0 < x_rank ) { msg << " " << x_N0 ; }
if ( 1 < x_rank ) { msg << " " << x_N1 ; }
if ( 2 < x_rank ) { msg << " " << x_N2 ; }
if ( 3 < x_rank ) { msg << " " << x_N3 ; }
if ( 4 < x_rank ) { msg << " " << x_N4 ; }
if ( 5 < x_rank ) { msg << " " << x_N5 ; }
if ( 6 < x_rank ) { msg << " " << x_N6 ; }
if ( 7 < x_rank ) { msg << " " << x_N7 ; }
msg << " } N = " << N0 << " )" ;
throw_runtime_exception( msg.str() );
}
} // namespace Impl
} // namespace Kokkos

View File

@@ -0,0 +1,917 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SHAPE_HPP
#define KOKKOS_SHAPE_HPP
#include <typeinfo>
#include <utility>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief The shape of a Kokkos array with dynamic and static dimensions.
 * Dynamic dimensions are member values and static dimensions are
 * compile-time 'enum' constants.
*
* The upper bound on the array rank is eight.
*/
template< unsigned ScalarSize ,
unsigned Rank ,
unsigned s0 = 1 ,
unsigned s1 = 1 ,
unsigned s2 = 1 ,
unsigned s3 = 1 ,
unsigned s4 = 1 ,
unsigned s5 = 1 ,
unsigned s6 = 1 ,
unsigned s7 = 1 >
struct Shape ;
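/* Example (illustrative): a rank-3 array of double with one runtime
 * dimension and static extents 3 and 4 -- e.g. View< double*[3][4] > --
 * has shape type Shape< sizeof(double) , 3 , 0 , 3 , 4 >. A zero in a
 * static-extent slot marks that dimension as dynamic, selecting the
 * "1 == dynamic_rank" specialization below: N0 is a member while
 * N1 == 3 and N2 == 4 are compile-time constants.
 */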
//----------------------------------------------------------------------------
/** \brief Shape equality: shapes are equal if the scalar size, rank,
 *         and all dimensions are equal.
 */
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
KOKKOS_INLINE_FUNCTION
bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{
enum { same_size = xSize == ySize };
enum { same_rank = xRank == yRank };
return same_size && same_rank &&
size_t( x.N0 ) == size_t( y.N0 ) &&
unsigned( x.N1 ) == unsigned( y.N1 ) &&
unsigned( x.N2 ) == unsigned( y.N2 ) &&
unsigned( x.N3 ) == unsigned( y.N3 ) &&
unsigned( x.N4 ) == unsigned( y.N4 ) &&
unsigned( x.N5 ) == unsigned( y.N5 ) &&
unsigned( x.N6 ) == unsigned( y.N6 ) &&
unsigned( x.N7 ) == unsigned( y.N7 ) ;
}
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
KOKKOS_INLINE_FUNCTION
bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{ return ! operator == ( x , y ); }
//----------------------------------------------------------------------------
void assert_counts_are_equal_throw(
const size_t x_count ,
const size_t y_count );
inline
void assert_counts_are_equal(
const size_t x_count ,
const size_t y_count )
{
if ( x_count != y_count ) {
assert_counts_are_equal_throw( x_count , y_count );
}
}
void assert_shapes_are_equal_throw(
const unsigned x_scalar_size ,
const unsigned x_rank ,
const size_t x_N0 , const unsigned x_N1 ,
const unsigned x_N2 , const unsigned x_N3 ,
const unsigned x_N4 , const unsigned x_N5 ,
const unsigned x_N6 , const unsigned x_N7 ,
const unsigned y_scalar_size ,
const unsigned y_rank ,
const size_t y_N0 , const unsigned y_N1 ,
const unsigned y_N2 , const unsigned y_N3 ,
const unsigned y_N4 , const unsigned y_N5 ,
const unsigned y_N6 , const unsigned y_N7 );
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
inline
void assert_shapes_are_equal(
const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{
typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
if ( x != y ) {
assert_shapes_are_equal_throw(
x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
}
}
template< unsigned xSize , unsigned xRank ,
unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
unsigned ySize , unsigned yRank ,
unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
void assert_shapes_equal_dimension(
const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
{
typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
// Omit comparison of scalar_size.
if ( unsigned( x.rank ) != unsigned( y.rank ) ||
size_t( x.N0 ) != size_t( y.N0 ) ||
unsigned( x.N1 ) != unsigned( y.N1 ) ||
unsigned( x.N2 ) != unsigned( y.N2 ) ||
unsigned( x.N3 ) != unsigned( y.N3 ) ||
unsigned( x.N4 ) != unsigned( y.N4 ) ||
unsigned( x.N5 ) != unsigned( y.N5 ) ||
unsigned( x.N6 ) != unsigned( y.N6 ) ||
unsigned( x.N7 ) != unsigned( y.N7 ) ) {
assert_shapes_are_equal_throw(
x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
}
}
//----------------------------------------------------------------------------
template< class ShapeType > struct assert_shape_is_rank_zero ;
template< class ShapeType > struct assert_shape_is_rank_one ;
template< unsigned Size >
struct assert_shape_is_rank_zero< Shape<Size,0> >
: public true_type {};
template< unsigned Size , unsigned s0 >
struct assert_shape_is_rank_one< Shape<Size,1,s0> >
: public true_type {};
//----------------------------------------------------------------------------
/** \brief Array bounds assertion templated on the execution space
* to allow device-specific abort code.
*/
template< class Space >
struct AssertShapeBoundsAbort ;
template<>
struct AssertShapeBoundsAbort< Kokkos::HostSpace >
{
static void apply( const size_t rank ,
const size_t n0 , const size_t n1 ,
const size_t n2 , const size_t n3 ,
const size_t n4 , const size_t n5 ,
const size_t n6 , const size_t n7 ,
const size_t arg_rank ,
const size_t i0 , const size_t i1 ,
const size_t i2 , const size_t i3 ,
const size_t i4 , const size_t i5 ,
const size_t i6 , const size_t i7 );
};
template< class ExecutionSpace >
struct AssertShapeBoundsAbort
{
KOKKOS_INLINE_FUNCTION
static void apply( const size_t rank ,
const size_t n0 , const size_t n1 ,
const size_t n2 , const size_t n3 ,
const size_t n4 , const size_t n5 ,
const size_t n6 , const size_t n7 ,
const size_t arg_rank ,
const size_t i0 , const size_t i1 ,
const size_t i2 , const size_t i3 ,
const size_t i4 , const size_t i5 ,
const size_t i6 , const size_t i7 )
{
AssertShapeBoundsAbort< Kokkos::HostSpace >
::apply( rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 ,
arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
}
};
template< class ShapeType >
KOKKOS_INLINE_FUNCTION
void assert_shape_bounds( const ShapeType & shape ,
const size_t arg_rank ,
const size_t i0 ,
const size_t i1 = 0 ,
const size_t i2 = 0 ,
const size_t i3 = 0 ,
const size_t i4 = 0 ,
const size_t i5 = 0 ,
const size_t i6 = 0 ,
const size_t i7 = 0 )
{
// Must supply at least as many indices as ranks.
// Every index must be within bounds.
const bool ok = ShapeType::rank <= arg_rank &&
i0 < shape.N0 &&
i1 < shape.N1 &&
i2 < shape.N2 &&
i3 < shape.N3 &&
i4 < shape.N4 &&
i5 < shape.N5 &&
i6 < shape.N6 &&
i7 < shape.N7 ;
if ( ! ok ) {
AssertShapeBoundsAbort< Kokkos::Impl::ActiveExecutionMemorySpace >
::apply( ShapeType::rank ,
shape.N0 , shape.N1 , shape.N2 , shape.N3 ,
shape.N4 , shape.N5 , shape.N6 , shape.N7 ,
arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
}
}
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6);
#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7);
#else
#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */
#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Specialization and optimization for the Rank 0 shape.
template < unsigned ScalarSize >
struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 0 };
enum { rank = 0 };
enum { N0 = 1 };
enum { N1 = 1 };
enum { N2 = 1 };
enum { N3 = 1 };
enum { N4 = 1 };
enum { N5 = 1 };
enum { N6 = 1 };
enum { N7 = 1 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{}
};
//----------------------------------------------------------------------------
template< unsigned R > struct assign_shape_dimension ;
#define KOKKOS_ASSIGN_SHAPE_DIMENSION( R ) \
template<> \
struct assign_shape_dimension< R > \
{ \
template< class ShapeType > \
KOKKOS_INLINE_FUNCTION \
assign_shape_dimension( ShapeType & shape \
, typename Impl::enable_if<( R < ShapeType::rank_dynamic ), size_t >::type n \
) { shape.N ## R = n ; } \
};
KOKKOS_ASSIGN_SHAPE_DIMENSION(0)
KOKKOS_ASSIGN_SHAPE_DIMENSION(1)
KOKKOS_ASSIGN_SHAPE_DIMENSION(2)
KOKKOS_ASSIGN_SHAPE_DIMENSION(3)
KOKKOS_ASSIGN_SHAPE_DIMENSION(4)
KOKKOS_ASSIGN_SHAPE_DIMENSION(5)
KOKKOS_ASSIGN_SHAPE_DIMENSION(6)
KOKKOS_ASSIGN_SHAPE_DIMENSION(7)
#undef KOKKOS_ASSIGN_SHAPE_DIMENSION
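// For reference, KOKKOS_ASSIGN_SHAPE_DIMENSION(2) expands to:
//
//   template<>
//   struct assign_shape_dimension<2>
//   {
//     template< class ShapeType >
//     KOKKOS_INLINE_FUNCTION
//     assign_shape_dimension( ShapeType & shape
//       , typename Impl::enable_if<( 2 < ShapeType::rank_dynamic ), size_t >::type n
//       ) { shape.N2 = n ; }
//   };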
//----------------------------------------------------------------------------
// All-static dimension array
template < unsigned ScalarSize ,
unsigned Rank ,
unsigned s0 ,
unsigned s1 ,
unsigned s2 ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape {
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 0 };
enum { rank = Rank };
enum { N0 = s0 };
enum { N1 = s1 };
enum { N2 = s2 };
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{}
};
// 1 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize ,
unsigned Rank ,
unsigned s1 ,
unsigned s2 ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 1 };
enum { rank = Rank };
size_t N0 ; // For rank_dynamic == 1, N0 is stored as size_t so it may exceed 2^32
enum { N1 = s1 };
enum { N2 = s2 };
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
size_t n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; }
};
// 2 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s2 ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 2 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
enum { N2 = s2 };
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; }
};
// 3 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s3 ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 3 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
enum { N3 = s3 };
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; }
};
// 4 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s4 ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 4 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
enum { N4 = s4 };
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; }
};
// 5 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s5 ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 5 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
enum { N5 = s5 };
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
{ s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; }
};
// 6 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s6 ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 6 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
unsigned N5 ;
enum { N6 = s6 };
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned n5 , unsigned = 0 , unsigned = 0 )
{
s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
s.N4 = n4 ; s.N5 = n5 ;
}
};
// 7 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize , unsigned Rank ,
unsigned s7 >
struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 7 };
enum { rank = Rank };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
unsigned N5 ;
unsigned N6 ;
enum { N7 = s7 };
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 )
{
s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ;
}
};
// 8 == dynamic_rank <= rank <= 8
template < unsigned ScalarSize >
struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 >
{
enum { scalar_size = ScalarSize };
enum { rank_dynamic = 8 };
enum { rank = 8 };
unsigned N0 ;
unsigned N1 ;
unsigned N2 ;
unsigned N3 ;
unsigned N4 ;
unsigned N5 ;
unsigned N6 ;
unsigned N7 ;
KOKKOS_INLINE_FUNCTION
static
void assign( Shape & s ,
unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 )
{
s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ;
}
};
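// Illustrative example (types chosen here, not from the original source):
// a rank-3 shape with two dynamic dimensions and one static dimension of 4,
//   typedef Shape< sizeof(int) , 3 , 0,0,4,1, 1,1,1,1 > S ;
//   S s ; S::assign( s , 10 , 20 ); // s.N0 == 10, s.N1 == 20, S::N2 == 4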
//----------------------------------------------------------------------------
template< class ShapeType , unsigned N ,
unsigned R = ShapeType::rank_dynamic >
struct ShapeInsert ;
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 0 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
N ,
ShapeType::N0 ,
ShapeType::N1 ,
ShapeType::N2 ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 1 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
N ,
ShapeType::N1 ,
ShapeType::N2 ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 2 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
N ,
ShapeType::N2 ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 3 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N3 ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 4 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N4 ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 5 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N5 ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 6 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
0 ,
0 ,
N ,
ShapeType::N6 > type ;
};
template< class ShapeType , unsigned N >
struct ShapeInsert< ShapeType , N , 7 >
{
typedef Shape< ShapeType::scalar_size ,
ShapeType::rank + 1 ,
0 ,
0 ,
0 ,
0 ,
0 ,
0 ,
0 ,
N > type ;
};
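// Illustrative example (names chosen here for exposition): ShapeInsert
// prepends a compile-time dimension N ahead of the existing static
// dimensions, keeping the dynamic (zero) slots in front.
//   typedef Shape< sizeof(double) , 2 , 3,4,1,1, 1,1,1,1 > S2 ;
//   typedef ShapeInsert< S2 , 5 >::type S3 ;
//   // S3 is Shape< sizeof(double) , 3 , 5,3,4,1, 1,1,1,1 >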
//----------------------------------------------------------------------------
template< class DstShape , class SrcShape ,
unsigned DstRankDynamic = DstShape::rank_dynamic ,
bool DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) >
struct ShapeCompatible { enum { value = false }; };
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 8 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 7 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 6 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 5 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 4 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 3 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 2 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 1 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
template< class DstShape , class SrcShape >
struct ShapeCompatible< DstShape , SrcShape , 0 , true >
{
enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
unsigned(DstShape::N0) == unsigned(SrcShape::N0) &&
unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
};
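// In summary, per the specializations above: two shapes are compatible when
// their scalar sizes agree and every compile-time dimension of the
// destination matches the corresponding source dimension; the destination's
// dynamic dimensions are assigned at runtime instead.  E.g. a destination
// with rank_dynamic == 8 is compatible with any source of equal scalar size.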
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< unsigned ScalarSize , unsigned Rank ,
unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 ,
typename iType >
KOKKOS_INLINE_FUNCTION
size_t dimension(
const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ,
const iType & r )
{
return 0 == r ? shape.N0 : (
1 == r ? shape.N1 : (
2 == r ? shape.N2 : (
3 == r ? shape.N3 : (
4 == r ? shape.N4 : (
5 == r ? shape.N5 : (
6 == r ? shape.N6 : (
7 == r ? shape.N7 : 1 )))))));
}
template< unsigned ScalarSize , unsigned Rank ,
unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
KOKKOS_INLINE_FUNCTION
size_t cardinality_count(
const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape )
{
return size_t(shape.N0) * shape.N1 * shape.N2 * shape.N3 *
shape.N4 * shape.N5 * shape.N6 * shape.N7 ;
}
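// Illustrative example: for a rank-2 shape with N0 == 10 and N1 == 20,
// dimension( shape , 1 ) returns 20 and cardinality_count( shape ) returns
// 10 * 20 == 200, since all trailing dimensions are 1.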
//----------------------------------------------------------------------------
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #ifndef KOKKOS_CORESHAPE_HPP */

View File

@@ -0,0 +1,55 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_SINGLETON_HPP
#define KOKKOS_SINGLETON_HPP
#include <Kokkos_Macros.hpp>
#include <cstddef>
namespace Kokkos { namespace Impl {
}} // namespace Kokkos::Impl
#endif // KOKKOS_SINGLETON_HPP

View File

@@ -0,0 +1,79 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_STATICASSERT_HPP
#define KOKKOS_STATICASSERT_HPP
namespace Kokkos {
namespace Impl {
template < bool , class T = void >
struct StaticAssert ;
template< class T >
struct StaticAssert< true , T > {
typedef T type ;
static const bool value = true ;
};
template < class A , class B >
struct StaticAssertSame ;
template < class A >
struct StaticAssertSame<A,A> { typedef A type ; };
template < class A , class B >
struct StaticAssertAssignable ;
template < class A >
struct StaticAssertAssignable<A,A> { typedef A type ; };
template < class A >
struct StaticAssertAssignable< const A , A > { typedef const A type ; };
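// Illustrative example: StaticAssertSame< int , int >::type is int, while
// StaticAssertSame< int , long > has no definition and therefore fails to
// compile, which is the intended compile-time check.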
} // namespace Impl
} // namespace Kokkos
#endif /* KOKKOS_STATICASSERT_HPP */

View File

@@ -0,0 +1,156 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_TAGS_HPP
#define KOKKOS_TAGS_HPP
#include <impl/Kokkos_Traits.hpp>
#include <Kokkos_Core_fwd.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
template<class ExecutionSpace, class MemorySpace>
struct Device {
typedef ExecutionSpace execution_space;
typedef MemorySpace memory_space;
typedef Device<execution_space,memory_space> device_type;
};
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class C , class Enable = void >
struct is_memory_space : public bool_< false > {};
template< class C , class Enable = void >
struct is_execution_space : public bool_< false > {};
template< class C , class Enable = void >
struct is_execution_policy : public bool_< false > {};
template< class C , class Enable = void >
struct is_array_layout : public Impl::false_type {};
template< class C , class Enable = void >
struct is_memory_traits : public Impl::false_type {};
template< class C >
struct is_memory_space< C , typename Impl::enable_if_type< typename C::memory_space >::type >
: public bool_< Impl::is_same< C , typename C::memory_space >::value > {};
template< class C >
struct is_execution_space< C , typename Impl::enable_if_type< typename C::execution_space >::type >
: public bool_< Impl::is_same< C , typename C::execution_space >::value > {};
template< class C >
struct is_execution_policy< C , typename Impl::enable_if_type< typename C::execution_policy >::type >
: public bool_< Impl::is_same< C , typename C::execution_policy >::value > {};
template< class C >
struct is_array_layout< C , typename Impl::enable_if_type< typename C::array_layout >::type >
: public bool_< Impl::is_same< C , typename C::array_layout >::value > {};
template< class C >
struct is_memory_traits< C , typename Impl::enable_if_type< typename C::memory_traits >::type >
: public bool_< Impl::is_same< C , typename C::memory_traits >::value > {};
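// Illustrative example: is_memory_space< HostSpace >::value is true because
// HostSpace names itself as its own memory_space typedef; a type without a
// memory_space typedef falls through to the primary template and yields false.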
//----------------------------------------------------------------------------
template< class C , class Enable = void >
struct is_space : public Impl::false_type {};
template< class C >
struct is_space< C
, typename Impl::enable_if<(
Impl::is_same< C , typename C::execution_space >::value ||
Impl::is_same< C , typename C::memory_space >::value ||
Impl::is_same< C , Device<
typename C::execution_space,
typename C::memory_space> >::value
)>::type
>
: public Impl::true_type
{
typedef typename C::execution_space execution_space ;
typedef typename C::memory_space memory_space ;
// The host_memory_space names a host-resident memory space.
// If this space's memory space is host accessible then use that memory space,
// otherwise use HostSpace.
typedef
typename Impl::if_c< Impl::is_same< memory_space , HostSpace >::value
#ifdef KOKKOS_HAVE_CUDA
|| Impl::is_same< memory_space , CudaUVMSpace>::value
|| Impl::is_same< memory_space , CudaHostPinnedSpace>::value
#endif
, memory_space , HostSpace >::type
host_memory_space ;
// The host_execution_space names an execution space that can access HostSpace.
// If this space's execution space can access HostSpace then use it directly,
// otherwise use DefaultHostExecutionSpace.
#ifdef KOKKOS_HAVE_CUDA
typedef
typename Impl::if_c< Impl::is_same< execution_space , Cuda >::value
, DefaultHostExecutionSpace , execution_space >::type
host_execution_space ;
#else
typedef execution_space host_execution_space;
#endif
typedef Device<host_execution_space,host_memory_space> host_mirror_space;
};
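// Illustrative example: for C = Kokkos::Cuda the memory space is CudaSpace,
// so host_memory_space resolves to HostSpace and host_execution_space to
// DefaultHostExecutionSpace; for a host-resident space both aliases collapse
// to the space's own execution and memory space.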
}
}
#endif /* KOKKOS_TAGS_HPP */

View File

@@ -0,0 +1,115 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPLWALLTIME_HPP
#define KOKKOS_IMPLWALLTIME_HPP
#include <stddef.h>
#ifdef _MSC_VER
#undef KOKKOS_USE_LIBRT
#include <gettimeofday.c>
#else
#ifdef KOKKOS_USE_LIBRT
#include <ctime>
#else
#include <sys/time.h>
#endif
#endif
namespace Kokkos {
namespace Impl {
/** \brief Time since construction */
class Timer {
private:
#ifdef KOKKOS_USE_LIBRT
struct timespec m_old;
#else
struct timeval m_old ;
#endif
Timer( const Timer & );
Timer & operator = ( const Timer & );
public:
inline
void reset() {
#ifdef KOKKOS_USE_LIBRT
clock_gettime(CLOCK_REALTIME, &m_old);
#else
gettimeofday( & m_old , ((struct timezone *) NULL ) );
#endif
}
inline
~Timer() {}
inline
Timer() { reset(); }
inline
double seconds() const
{
#ifdef KOKKOS_USE_LIBRT
struct timespec m_new;
clock_gettime(CLOCK_REALTIME, &m_new);
return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) +
( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
#else
struct timeval m_new ;
::gettimeofday( & m_new , ((struct timezone *) NULL ) );
return ( (double) ( m_new.tv_sec - m_old.tv_sec ) ) +
( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 );
#endif
}
};
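// Illustrative usage sketch (do_work is a placeholder, not part of this header):
//   Kokkos::Impl::Timer timer;         // clock starts at construction
//   do_work();
//   double elapsed = timer.seconds();  // wall-clock seconds since last reset
//   timer.reset();                     // restart the clock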
} // namespace Impl
} // namespace Kokkos
#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */