924 lines
32 KiB
C++
924 lines
32 KiB
C++
/*
|
|
//@HEADER
|
|
// ************************************************************************
|
|
//
|
|
// Kokkos v. 2.0
|
|
// Copyright (2014) Sandia Corporation
|
|
//
|
|
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
|
// the U.S. Government retains certain rights in this software.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
//
|
|
// 2. Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
//
|
|
// 3. Neither the name of the Corporation nor the names of the
|
|
// contributors may be used to endorse or promote products derived from
|
|
// this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
|
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
|
//
|
|
// ************************************************************************
|
|
//@HEADER
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdexcept>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
|
|
#include <Kokkos_Core.hpp>
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
namespace {
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct TestTeamPolicy {
|
|
|
|
typedef typename Kokkos::TeamPolicy< ScheduleType, ExecSpace >::member_type team_member ;
|
|
typedef Kokkos::View<int**,ExecSpace> view_type ;
|
|
|
|
view_type m_flags ;
|
|
|
|
TestTeamPolicy( const size_t league_size )
|
|
: m_flags( Kokkos::ViewAllocateWithoutInitializing("flags")
|
|
, Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( *this )
|
|
, league_size )
|
|
{}
|
|
|
|
struct VerifyInitTag {};
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const team_member & member ) const
|
|
{
|
|
const int tid = member.team_rank() + member.team_size() * member.league_rank();
|
|
|
|
m_flags( member.team_rank() , member.league_rank() ) = tid ;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const VerifyInitTag & , const team_member & member ) const
|
|
{
|
|
const int tid = member.team_rank() + member.team_size() * member.league_rank();
|
|
|
|
if ( tid != m_flags( member.team_rank() , member.league_rank() ) ) {
|
|
printf("TestTeamPolicy member(%d,%d) error %d != %d\n"
|
|
, member.league_rank() , member.team_rank()
|
|
, tid , m_flags( member.team_rank() , member.league_rank() ) );
|
|
}
|
|
}
|
|
|
|
// included for test_small_league_size
|
|
TestTeamPolicy()
|
|
: m_flags()
|
|
{}
|
|
|
|
// included for test_small_league_size
|
|
struct NoOpTag {} ;
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const NoOpTag & , const team_member & member ) const
|
|
{}
|
|
|
|
|
|
static void test_small_league_size() {
|
|
|
|
int bs = 8; // batch size (number of elements per batch)
|
|
int ns = 16; // total number of "problems" to process
|
|
|
|
// calculate total scratch memory space size
|
|
const int level = 0;
|
|
int mem_size = 960;
|
|
const int num_teams = ns/bs;
|
|
const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy(num_teams, Kokkos::AUTO());
|
|
|
|
Kokkos::parallel_for ( policy.set_scratch_size(level, Kokkos::PerTeam(mem_size), Kokkos::PerThread(0))
|
|
, TestTeamPolicy()
|
|
);
|
|
}
|
|
|
|
static void test_for( const size_t league_size )
|
|
{
|
|
TestTeamPolicy functor( league_size );
|
|
|
|
const int team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( functor );
|
|
|
|
Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size , team_size ) , functor );
|
|
Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType, ExecSpace , VerifyInitTag >( league_size , team_size ) , functor );
|
|
|
|
test_small_league_size();
|
|
}
|
|
|
|
struct ReduceTag {};
|
|
|
|
typedef long value_type ;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const team_member & member , value_type & update ) const
|
|
{
|
|
update += member.team_rank() + member.team_size() * member.league_rank();
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const ReduceTag & , const team_member & member , value_type & update ) const
|
|
{
|
|
update += 1 + member.team_rank() + member.team_size() * member.league_rank();
|
|
}
|
|
|
|
static void test_reduce( const size_t league_size )
|
|
{
|
|
TestTeamPolicy functor( league_size );
|
|
|
|
const int team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( functor );
|
|
const long N = team_size * league_size ;
|
|
|
|
long total = 0 ;
|
|
|
|
Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size , team_size ) , functor , total );
|
|
ASSERT_EQ( size_t((N-1)*(N))/2 , size_t(total) );
|
|
|
|
Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace , ReduceTag >( league_size , team_size ) , functor , total );
|
|
ASSERT_EQ( (size_t(N)*size_t(N+1))/2 , size_t(total) );
|
|
}
|
|
};
|
|
|
|
}
|
|
}
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
|
|
template< typename ScalarType , class DeviceType, class ScheduleType >
|
|
class ReduceTeamFunctor
|
|
{
|
|
public:
|
|
typedef DeviceType execution_space ;
|
|
typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type ;
|
|
typedef typename execution_space::size_type size_type ;
|
|
|
|
struct value_type {
|
|
ScalarType value[3] ;
|
|
};
|
|
|
|
const size_type nwork ;
|
|
|
|
ReduceTeamFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
|
|
|
|
ReduceTeamFunctor( const ReduceTeamFunctor & rhs )
|
|
: nwork( rhs.nwork ) {}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void init( value_type & dst ) const
|
|
{
|
|
dst.value[0] = 0 ;
|
|
dst.value[1] = 0 ;
|
|
dst.value[2] = 0 ;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void join( volatile value_type & dst ,
|
|
const volatile value_type & src ) const
|
|
{
|
|
dst.value[0] += src.value[0] ;
|
|
dst.value[1] += src.value[1] ;
|
|
dst.value[2] += src.value[2] ;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const typename policy_type::member_type ind , value_type & dst ) const
|
|
{
|
|
const int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank();
|
|
const int thread_size = ind.team_size() * ind.league_size();
|
|
const int chunk = ( nwork + thread_size - 1 ) / thread_size ;
|
|
|
|
size_type iwork = chunk * thread_rank ;
|
|
const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork ;
|
|
|
|
for ( ; iwork < iwork_end ; ++iwork ) {
|
|
dst.value[0] += 1 ;
|
|
dst.value[1] += iwork + 1 ;
|
|
dst.value[2] += nwork - iwork ;
|
|
}
|
|
}
|
|
};
|
|
|
|
} // namespace Test
|
|
|
|
namespace {
|
|
|
|
template< typename ScalarType , class DeviceType, class ScheduleType >
|
|
class TestReduceTeam
|
|
{
|
|
public:
|
|
typedef DeviceType execution_space ;
|
|
typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type ;
|
|
typedef typename execution_space::size_type size_type ;
|
|
|
|
//------------------------------------
|
|
|
|
TestReduceTeam( const size_type & nwork )
|
|
{
|
|
run_test(nwork);
|
|
}
|
|
|
|
void run_test( const size_type & nwork )
|
|
{
|
|
typedef Test::ReduceTeamFunctor< ScalarType , execution_space , ScheduleType> functor_type ;
|
|
typedef typename functor_type::value_type value_type ;
|
|
typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type ;
|
|
|
|
enum { Count = 3 };
|
|
enum { Repeat = 100 };
|
|
|
|
value_type result[ Repeat ];
|
|
|
|
const unsigned long nw = nwork ;
|
|
const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
|
|
: (nw/2) * ( nw + 1 );
|
|
|
|
const unsigned team_size = policy_type::team_size_recommended( functor_type(nwork) );
|
|
const unsigned league_size = ( nwork + team_size - 1 ) / team_size ;
|
|
|
|
policy_type team_exec( league_size , team_size );
|
|
|
|
for ( unsigned i = 0 ; i < Repeat ; ++i ) {
|
|
result_type tmp( & result[i] );
|
|
Kokkos::parallel_reduce( team_exec , functor_type(nwork) , tmp );
|
|
}
|
|
|
|
execution_space::fence();
|
|
|
|
for ( unsigned i = 0 ; i < Repeat ; ++i ) {
|
|
for ( unsigned j = 0 ; j < Count ; ++j ) {
|
|
const unsigned long correct = 0 == j % 3 ? nw : nsum ;
|
|
ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
}
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
|
|
template< class DeviceType, class ScheduleType >
|
|
class ScanTeamFunctor
|
|
{
|
|
public:
|
|
typedef DeviceType execution_space ;
|
|
typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type ;
|
|
|
|
typedef long int value_type ;
|
|
Kokkos::View< value_type , execution_space > accum ;
|
|
Kokkos::View< value_type , execution_space > total ;
|
|
|
|
ScanTeamFunctor() : accum("accum"), total("total") {}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void init( value_type & error ) const { error = 0 ; }
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void join( value_type volatile & error ,
|
|
value_type volatile const & input ) const
|
|
{ if ( input ) error = 1 ; }
|
|
|
|
struct JoinMax {
|
|
typedef long int value_type ;
|
|
KOKKOS_INLINE_FUNCTION
|
|
void join( value_type volatile & dst
|
|
, value_type volatile const & input ) const
|
|
{ if ( dst < input ) dst = input ; }
|
|
};
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const typename policy_type::member_type ind , value_type & error ) const
|
|
{
|
|
if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
|
|
const long int thread_count = ind.league_size() * ind.team_size();
|
|
total() = ( thread_count * ( thread_count + 1 ) ) / 2 ;
|
|
}
|
|
|
|
// Team max:
|
|
const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) , JoinMax() );
|
|
|
|
if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
|
|
printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n"
|
|
, ind.league_rank(), ind.team_rank()
|
|
, ind.league_size(), ind.team_size()
|
|
, (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m );
|
|
}
|
|
|
|
// Scan:
|
|
const long int answer =
|
|
( ind.league_rank() + 1 ) * ind.team_rank() +
|
|
( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ;
|
|
|
|
const long int result =
|
|
ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
|
|
|
|
const long int result2 =
|
|
ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
|
|
|
|
if ( answer != result || answer != result2 ) {
|
|
printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
|
|
ind.league_rank(), ind.team_rank(),
|
|
ind.league_size(), ind.team_size(),
|
|
answer,result,result2);
|
|
error = 1 ;
|
|
}
|
|
|
|
const long int thread_rank = ind.team_rank() +
|
|
ind.team_size() * ind.league_rank();
|
|
ind.team_scan( 1 + thread_rank , accum.ptr_on_device() );
|
|
}
|
|
};
|
|
|
|
template< class DeviceType, class ScheduleType >
|
|
class TestScanTeam
|
|
{
|
|
public:
|
|
typedef DeviceType execution_space ;
|
|
typedef long int value_type ;
|
|
|
|
typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type ;
|
|
typedef Test::ScanTeamFunctor<DeviceType, ScheduleType> functor_type ;
|
|
|
|
//------------------------------------
|
|
|
|
TestScanTeam( const size_t nteam )
|
|
{
|
|
run_test(nteam);
|
|
}
|
|
|
|
void run_test( const size_t nteam )
|
|
{
|
|
typedef Kokkos::View< long int , Kokkos::HostSpace , Kokkos::MemoryUnmanaged > result_type ;
|
|
const unsigned REPEAT = 100000 ;
|
|
unsigned Repeat;
|
|
if ( nteam == 0 )
|
|
{
|
|
Repeat = 1;
|
|
} else {
|
|
Repeat = ( REPEAT + nteam - 1 ) / nteam ; //error here
|
|
}
|
|
|
|
functor_type functor ;
|
|
|
|
policy_type team_exec( nteam , policy_type::team_size_max( functor ) );
|
|
|
|
for ( unsigned i = 0 ; i < Repeat ; ++i ) {
|
|
long int accum = 0 ;
|
|
long int total = 0 ;
|
|
long int error = 0 ;
|
|
Kokkos::deep_copy( functor.accum , total );
|
|
Kokkos::parallel_reduce( team_exec , functor , result_type( & error ) );
|
|
DeviceType::fence();
|
|
Kokkos::deep_copy( accum , functor.accum );
|
|
Kokkos::deep_copy( total , functor.total );
|
|
|
|
ASSERT_EQ( error , 0 );
|
|
ASSERT_EQ( total , accum );
|
|
}
|
|
|
|
execution_space::fence();
|
|
}
|
|
};
|
|
|
|
} // namespace Test
|
|
|
|
/*--------------------------------------------------------------------------*/
|
|
|
|
namespace Test {
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct SharedTeamFunctor {
|
|
|
|
typedef ExecSpace execution_space ;
|
|
typedef int value_type ;
|
|
typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type ;
|
|
|
|
enum { SHARED_COUNT = 1000 };
|
|
|
|
typedef typename ExecSpace::scratch_memory_space shmem_space ;
|
|
|
|
// tbd: MemoryUnmanaged should be the default for shared memory space
|
|
typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
|
|
|
|
// Tell how much shared memory will be required by this functor:
|
|
inline
|
|
unsigned team_shmem_size( int team_size ) const
|
|
{
|
|
return shared_int_array_type::shmem_size( SHARED_COUNT ) +
|
|
shared_int_array_type::shmem_size( SHARED_COUNT );
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const typename policy_type::member_type & ind , value_type & update ) const
|
|
{
|
|
const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
|
|
const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
|
|
|
|
if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
|
|
(shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
|
|
printf ("Failed to allocate shared memory of size %lu\n",
|
|
static_cast<unsigned long> (SHARED_COUNT));
|
|
++update; // failure to allocate is an error
|
|
}
|
|
else {
|
|
for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) {
|
|
shared_A[i] = i + ind.league_rank();
|
|
shared_B[i] = 2 * i + ind.league_rank();
|
|
}
|
|
|
|
ind.team_barrier();
|
|
|
|
if ( ind.team_rank() + 1 == ind.team_size() ) {
|
|
for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
|
|
if ( shared_A[i] != i + ind.league_rank() ) {
|
|
++update ;
|
|
}
|
|
if ( shared_B[i] != 2 * i + ind.league_rank() ) {
|
|
++update ;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
}
|
|
|
|
namespace {
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct TestSharedTeam {
|
|
|
|
TestSharedTeam()
|
|
{ run(); }
|
|
|
|
void run()
|
|
{
|
|
typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor ;
|
|
typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged > result_type ;
|
|
|
|
const size_t team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( Functor() );
|
|
|
|
Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size , team_size );
|
|
|
|
typename Functor::value_type error_count = 0 ;
|
|
|
|
Kokkos::parallel_reduce( team_exec , Functor() , result_type( & error_count ) );
|
|
|
|
ASSERT_EQ( error_count , 0 );
|
|
}
|
|
};
|
|
}
|
|
|
|
namespace Test {
|
|
|
|
#if defined (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
|
|
template< class MemorySpace, class ExecSpace, class ScheduleType >
|
|
struct TestLambdaSharedTeam {
|
|
|
|
TestLambdaSharedTeam()
|
|
{ run(); }
|
|
|
|
void run()
|
|
{
|
|
typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor ;
|
|
//typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged > result_type ;
|
|
typedef Kokkos::View< typename Functor::value_type , MemorySpace, Kokkos::MemoryUnmanaged > result_type ;
|
|
|
|
typedef typename ExecSpace::scratch_memory_space shmem_space ;
|
|
|
|
// tbd: MemoryUnmanaged should be the default for shared memory space
|
|
typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
|
|
|
|
const int SHARED_COUNT = 1000;
|
|
int team_size = 1;
|
|
#ifdef KOKKOS_ENABLE_CUDA
|
|
if(std::is_same<ExecSpace,Kokkos::Cuda>::value)
|
|
team_size = 128;
|
|
#endif
|
|
Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size , team_size);
|
|
team_exec = team_exec.set_scratch_size(0,Kokkos::PerTeam(SHARED_COUNT*2*sizeof(int)));
|
|
|
|
typename Functor::value_type error_count = 0 ;
|
|
|
|
Kokkos::parallel_reduce( team_exec , KOKKOS_LAMBDA
|
|
( const typename Kokkos::TeamPolicy< ScheduleType, ExecSpace >::member_type & ind , int & update ) {
|
|
|
|
const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
|
|
const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
|
|
|
|
if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
|
|
(shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
|
|
printf ("Failed to allocate shared memory of size %lu\n",
|
|
static_cast<unsigned long> (SHARED_COUNT));
|
|
++update; // failure to allocate is an error
|
|
} else {
|
|
for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) {
|
|
shared_A[i] = i + ind.league_rank();
|
|
shared_B[i] = 2 * i + ind.league_rank();
|
|
}
|
|
|
|
ind.team_barrier();
|
|
|
|
if ( ind.team_rank() + 1 == ind.team_size() ) {
|
|
for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
|
|
if ( shared_A[i] != i + ind.league_rank() ) {
|
|
++update ;
|
|
}
|
|
if ( shared_B[i] != 2 * i + ind.league_rank() ) {
|
|
++update ;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}, result_type( & error_count ) );
|
|
|
|
ASSERT_EQ( error_count , 0 );
|
|
}
|
|
};
|
|
#endif
|
|
}
|
|
|
|
namespace Test {
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct ScratchTeamFunctor {
|
|
|
|
typedef ExecSpace execution_space ;
|
|
typedef int value_type ;
|
|
typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type ;
|
|
|
|
enum { SHARED_TEAM_COUNT = 100 };
|
|
enum { SHARED_THREAD_COUNT = 10 };
|
|
|
|
typedef typename ExecSpace::scratch_memory_space shmem_space ;
|
|
|
|
// tbd: MemoryUnmanaged should be the default for shared memory space
|
|
typedef Kokkos::View<size_t*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator()( const typename policy_type::member_type & ind , value_type & update ) const
|
|
{
|
|
const shared_int_array_type scratch_ptr( ind.team_scratch(1) , 3*ind.team_size() );
|
|
const shared_int_array_type scratch_A( ind.team_scratch(1) , SHARED_TEAM_COUNT );
|
|
const shared_int_array_type scratch_B( ind.thread_scratch(1) , SHARED_THREAD_COUNT );
|
|
|
|
if ((scratch_ptr.ptr_on_device () == NULL ) ||
|
|
(scratch_A. ptr_on_device () == NULL && SHARED_TEAM_COUNT > 0) ||
|
|
(scratch_B. ptr_on_device () == NULL && SHARED_THREAD_COUNT > 0)) {
|
|
printf ("Failed to allocate shared memory of size %lu\n",
|
|
static_cast<unsigned long> (SHARED_TEAM_COUNT));
|
|
++update; // failure to allocate is an error
|
|
}
|
|
else {
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(ind,0,(int)SHARED_TEAM_COUNT),[&] (const int &i) {
|
|
scratch_A[i] = i + ind.league_rank();
|
|
});
|
|
for(int i=0; i<SHARED_THREAD_COUNT; i++)
|
|
scratch_B[i] = 10000*ind.league_rank() + 100*ind.team_rank() + i;
|
|
|
|
scratch_ptr[ind.team_rank()] = (size_t) scratch_A.ptr_on_device();
|
|
scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t) scratch_B.ptr_on_device();
|
|
|
|
ind.team_barrier();
|
|
|
|
for( int i = 0; i<SHARED_TEAM_COUNT; i++) {
|
|
if(scratch_A[i] != size_t(i + ind.league_rank()))
|
|
++update;
|
|
}
|
|
for( int i = 0; i < ind.team_size(); i++) {
|
|
if(scratch_ptr[0]!=scratch_ptr[i]) ++update;
|
|
}
|
|
if(scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()] <
|
|
SHARED_THREAD_COUNT*sizeof(size_t))
|
|
++update;
|
|
for( int i = 1; i < ind.team_size(); i++) {
|
|
if((scratch_ptr[i+ind.team_size()] - scratch_ptr[i-1+ind.team_size()]) !=
|
|
(scratch_ptr[1+ind.team_size()] - scratch_ptr[0 + ind.team_size()])) ++update;
|
|
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
}
|
|
|
|
namespace {
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct TestScratchTeam {
|
|
|
|
TestScratchTeam()
|
|
{ run(); }
|
|
|
|
void run()
|
|
{
|
|
typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType> Functor ;
|
|
typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged > result_type ;
|
|
|
|
const size_t team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( Functor() );
|
|
|
|
Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size , team_size );
|
|
|
|
typename Functor::value_type error_count = 0 ;
|
|
|
|
int team_scratch_size = Functor::shared_int_array_type::shmem_size(Functor::SHARED_TEAM_COUNT) +
|
|
Functor::shared_int_array_type::shmem_size(3*team_size);
|
|
int thread_scratch_size = Functor::shared_int_array_type::shmem_size(Functor::SHARED_THREAD_COUNT);
|
|
Kokkos::parallel_reduce( team_exec.set_scratch_size(0,Kokkos::PerTeam(team_scratch_size),
|
|
Kokkos::PerThread(thread_scratch_size)) ,
|
|
Functor() , result_type( & error_count ) );
|
|
|
|
ASSERT_EQ( error_count , 0 );
|
|
}
|
|
};
|
|
}
|
|
|
|
namespace Test {
|
|
template< class ExecSpace>
|
|
KOKKOS_INLINE_FUNCTION
|
|
int test_team_mulit_level_scratch_loop_body(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team1(team.team_scratch(0),128);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread1(team.thread_scratch(0),16);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team2(team.team_scratch(0),128);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread2(team.thread_scratch(0),16);
|
|
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team1(team.team_scratch(1),128000);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread1(team.thread_scratch(1),16000);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team2(team.team_scratch(1),128000);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread2(team.thread_scratch(1),16000);
|
|
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_team3(team.team_scratch(0),128);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> a_thread3(team.thread_scratch(0),16);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_team3(team.team_scratch(1),128000);
|
|
Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>> b_thread3(team.thread_scratch(1),16000);
|
|
|
|
// The explicit types for 0 and 128 are here to test TeamThreadRange accepting different
|
|
// types for begin and end.
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,int(0),unsigned(128)), [&] (const int& i)
|
|
{
|
|
a_team1(i) = 1000000 + i;
|
|
a_team2(i) = 2000000 + i;
|
|
a_team3(i) = 3000000 + i;
|
|
});
|
|
team.team_barrier();
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i)
|
|
{
|
|
a_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
|
|
a_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
|
|
a_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
|
|
});
|
|
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i)
|
|
{
|
|
b_team1(i) = 1000000 + i;
|
|
b_team2(i) = 2000000 + i;
|
|
b_team3(i) = 3000000 + i;
|
|
});
|
|
team.team_barrier();
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i)
|
|
{
|
|
b_thread1(i) = 1000000 + 100000*team.team_rank() + 16-i;
|
|
b_thread2(i) = 2000000 + 100000*team.team_rank() + 16-i;
|
|
b_thread3(i) = 3000000 + 100000*team.team_rank() + 16-i;
|
|
});
|
|
|
|
team.team_barrier();
|
|
int error = 0;
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128), [&] (const int& i)
|
|
{
|
|
if(a_team1(i) != 1000000 + i) error++;
|
|
if(a_team2(i) != 2000000 + i) error++;
|
|
if(a_team3(i) != 3000000 + i) error++;
|
|
});
|
|
team.team_barrier();
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16), [&] (const int& i)
|
|
{
|
|
if(a_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
|
|
if(a_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
|
|
if(a_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
|
|
});
|
|
|
|
Kokkos::parallel_for(Kokkos::TeamThreadRange(team,0,128000), [&] (const int& i)
|
|
{
|
|
if(b_team1(i) != 1000000 + i) error++;
|
|
if(b_team2(i) != 2000000 + i) error++;
|
|
if(b_team3(i) != 3000000 + i) error++;
|
|
});
|
|
team.team_barrier();
|
|
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,16000), [&] (const int& i)
|
|
{
|
|
if(b_thread1(i) != 1000000 + 100000*team.team_rank() + 16-i) error++;
|
|
if(b_thread2(i) != 2000000 + 100000*team.team_rank() + 16-i) error++;
|
|
if( b_thread3(i) != 3000000 + 100000*team.team_rank() + 16-i) error++;
|
|
});
|
|
|
|
return error;
|
|
}
|
|
|
|
struct TagReduce {};
|
|
struct TagFor {};
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct ClassNoShmemSizeFunction {
|
|
Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
|
|
int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
errors() += error;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
|
|
error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
}
|
|
|
|
void run() {
|
|
Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
|
|
errors = d_errors;
|
|
|
|
const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
|
|
const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
|
|
|
|
const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
|
|
const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
|
|
{
|
|
Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
|
|
Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
|
|
*this);
|
|
Kokkos::fence();
|
|
typename Kokkos::View<int,ExecSpace>::HostMirror h_errors = Kokkos::create_mirror_view(d_errors);
|
|
Kokkos::deep_copy(h_errors,d_errors);
|
|
ASSERT_EQ(h_errors(),0);
|
|
}
|
|
|
|
{
|
|
int error = 0;
|
|
Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
|
|
Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
|
|
*this,error);
|
|
Kokkos::fence();
|
|
ASSERT_EQ(error,0);
|
|
}
|
|
};
|
|
};
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct ClassWithShmemSizeFunction {
|
|
Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator() (const TagFor&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team) const {
|
|
int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
errors() += error;
|
|
}
|
|
|
|
KOKKOS_INLINE_FUNCTION
|
|
void operator() (const TagReduce&, const typename Kokkos::TeamPolicy<ExecSpace,ScheduleType>::member_type& team, int& error) const {
|
|
error += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
}
|
|
|
|
void run() {
|
|
Kokkos::View<int,ExecSpace> d_errors = Kokkos::View<int,ExecSpace>("Errors");
|
|
errors = d_errors;
|
|
|
|
const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
|
|
const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
|
|
{
|
|
Kokkos::TeamPolicy<TagFor,ExecSpace,ScheduleType> policy(10,8,16);
|
|
Kokkos::parallel_for(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
|
|
*this);
|
|
Kokkos::fence();
|
|
typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(d_errors);
|
|
Kokkos::deep_copy(h_errors,d_errors);
|
|
ASSERT_EQ(h_errors(),0);
|
|
}
|
|
|
|
{
|
|
int error = 0;
|
|
Kokkos::TeamPolicy<TagReduce,ExecSpace,ScheduleType> policy(10,8,16);
|
|
Kokkos::parallel_reduce(policy.set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
|
|
*this,error);
|
|
Kokkos::fence();
|
|
ASSERT_EQ(error,0);
|
|
}
|
|
};
|
|
|
|
unsigned team_shmem_size(int team_size) const {
|
|
const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
|
|
const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
|
|
return per_team0 + team_size * per_thread0;
|
|
}
|
|
};
|
|
|
|
template< class ExecSpace, class ScheduleType >
|
|
void test_team_mulit_level_scratch_test_lambda() {
|
|
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
|
Kokkos::View<int,ExecSpace,Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
|
|
Kokkos::View<int,ExecSpace> d_errors("Errors");
|
|
errors = d_errors;
|
|
|
|
const int per_team0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128);
|
|
const int per_thread0 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16);
|
|
|
|
const int per_team1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(128000);
|
|
const int per_thread1 = 3*Kokkos::View<double*,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged>>::shmem_size(16000);
|
|
|
|
Kokkos::TeamPolicy<ExecSpace,ScheduleType> policy(10,8,16);
|
|
Kokkos::parallel_for(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
|
|
KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team) {
|
|
int error = test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
errors() += error;
|
|
});
|
|
Kokkos::fence();
|
|
typename Kokkos::View<int,ExecSpace>::HostMirror h_errors= Kokkos::create_mirror_view(errors);
|
|
Kokkos::deep_copy(h_errors,d_errors);
|
|
ASSERT_EQ(h_errors(),0);
|
|
|
|
int error = 0;
|
|
Kokkos::parallel_reduce(policy.set_scratch_size(0,Kokkos::PerTeam(per_team0),Kokkos::PerThread(per_thread0)).set_scratch_size(1,Kokkos::PerTeam(per_team1),Kokkos::PerThread(per_thread1)),
|
|
KOKKOS_LAMBDA(const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team, int& count) {
|
|
count += test_team_mulit_level_scratch_loop_body<ExecSpace>(team);
|
|
},error);
|
|
ASSERT_EQ(error,0);
|
|
Kokkos::fence();
|
|
#endif
|
|
}
|
|
|
|
|
|
}
|
|
|
|
namespace {
|
|
template< class ExecSpace, class ScheduleType >
|
|
struct TestMultiLevelScratchTeam {
|
|
|
|
TestMultiLevelScratchTeam()
|
|
{ run(); }
|
|
|
|
void run()
|
|
{
|
|
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
|
Test::test_team_mulit_level_scratch_test_lambda<ExecSpace, ScheduleType>();
|
|
#endif
|
|
Test::ClassNoShmemSizeFunction<ExecSpace, ScheduleType> c1;
|
|
c1.run();
|
|
|
|
Test::ClassWithShmemSizeFunction<ExecSpace, ScheduleType> c2;
|
|
c2.run();
|
|
|
|
}
|
|
};
|
|
}
|
|
|
|
namespace Test {
|
|
|
|
template< class ExecSpace >
|
|
struct TestShmemSize {
|
|
|
|
TestShmemSize() { run(); }
|
|
|
|
void run()
|
|
{
|
|
typedef Kokkos::View< long***, ExecSpace > view_type;
|
|
|
|
size_t d1 = 5;
|
|
size_t d2 = 6;
|
|
size_t d3 = 7;
|
|
|
|
size_t size = view_type::shmem_size( d1, d2, d3 );
|
|
|
|
ASSERT_EQ( size, d1 * d2 * d3 * sizeof(long) );
|
|
}
|
|
};
|
|
}
|
|
|
|
/*--------------------------------------------------------------------------*/
|