Files
lammps/lib/kokkos/core/unit_test/TestTeam.hpp
2018-03-08 10:57:08 -07:00

973 lines
35 KiB
C++

/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cstdio>
#include <stdexcept>
#include <sstream>
#include <iostream>
#include <Kokkos_Core.hpp>
namespace Test {
namespace {
template< class ExecSpace, class ScheduleType >
struct TestTeamPolicy {
typedef typename Kokkos::TeamPolicy< ScheduleType, ExecSpace >::member_type team_member;
typedef Kokkos::View< int**, ExecSpace > view_type;
view_type m_flags;
TestTeamPolicy( const size_t league_size )
: m_flags( Kokkos::ViewAllocateWithoutInitializing( "flags" ),
Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( *this ),
league_size ) {}
struct VerifyInitTag {};
KOKKOS_INLINE_FUNCTION
void operator()( const team_member & member ) const
{
const int tid = member.team_rank() + member.team_size() * member.league_rank();
m_flags( member.team_rank(), member.league_rank() ) = tid;
}
KOKKOS_INLINE_FUNCTION
void operator()( const VerifyInitTag &, const team_member & member ) const
{
const int tid = member.team_rank() + member.team_size() * member.league_rank();
if ( tid != m_flags( member.team_rank(), member.league_rank() ) ) {
printf( "TestTeamPolicy member(%d,%d) error %d != %d\n",
member.league_rank(), member.team_rank(),
tid, m_flags( member.team_rank(), member.league_rank() ) );
}
}
// Included for test_small_league_size.
TestTeamPolicy() : m_flags() {}
// Included for test_small_league_size.
struct NoOpTag {};
KOKKOS_INLINE_FUNCTION
void operator()( const NoOpTag &, const team_member & member ) const {}
static void test_small_league_size() {
int bs = 8; // batch size (number of elements per batch)
int ns = 16; // total number of "problems" to process
// Calculate total scratch memory space size.
const int level = 0;
int mem_size = 960;
const int num_teams = ns / bs;
const Kokkos::TeamPolicy< ExecSpace, NoOpTag > policy( num_teams, Kokkos::AUTO() );
Kokkos::parallel_for( policy.set_scratch_size( level, Kokkos::PerTeam( mem_size ), Kokkos::PerThread( 0 ) ),
TestTeamPolicy() );
}
static void test_for( const size_t league_size )
{
TestTeamPolicy functor( league_size );
const int team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( functor );
Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size, team_size ), functor );
Kokkos::parallel_for( Kokkos::TeamPolicy< ScheduleType, ExecSpace, VerifyInitTag >( league_size, team_size ), functor );
test_small_league_size();
}
struct ReduceTag {};
typedef long value_type;
KOKKOS_INLINE_FUNCTION
void operator()( const team_member & member, value_type & update ) const
{
update += member.team_rank() + member.team_size() * member.league_rank();
}
KOKKOS_INLINE_FUNCTION
void operator()( const ReduceTag &, const team_member & member, value_type & update ) const
{
update += 1 + member.team_rank() + member.team_size() * member.league_rank();
}
static void test_reduce( const size_t league_size )
{
TestTeamPolicy functor( league_size );
const int team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( functor );
const long N = team_size * league_size;
long total = 0;
Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace >( league_size, team_size ), functor, total );
ASSERT_EQ( size_t( ( N - 1 ) * ( N ) ) / 2, size_t( total ) );
Kokkos::parallel_reduce( Kokkos::TeamPolicy< ScheduleType, ExecSpace, ReduceTag >( league_size, team_size ), functor, total );
ASSERT_EQ( ( size_t( N ) * size_t( N + 1 ) ) / 2, size_t( total ) );
}
};
} // namespace
} // namespace Test
/*--------------------------------------------------------------------------*/
namespace Test {
// Team-policy reduction functor. Every thread walks a contiguous chunk of the
// index range [0, nwork) and accumulates three quantities per index:
//   value[0] : a count of visited indices,
//   value[1] : the sum of ( index + 1 ),
//   value[2] : the sum of ( nwork - index ).
template< typename ScalarType, class DeviceType, class ScheduleType >
class ReduceTeamFunctor
{
public:
  typedef DeviceType execution_space;
  typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type;
  typedef typename execution_space::size_type size_type;

  struct value_type {
    ScalarType value[3];
  };

  // Total number of work items shared among all threads of the league.
  const size_type nwork;

  ReduceTeamFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
  ReduceTeamFunctor( const ReduceTeamFunctor & rhs ) : nwork( rhs.nwork ) {}

  // Reduction identity: all three slots start at zero.
  KOKKOS_INLINE_FUNCTION
  void init( value_type & dst ) const
  {
    for ( int k = 0; k < 3; ++k ) {
      dst.value[k] = 0;
    }
  }

  // Combine two partial results slot by slot.
  KOKKOS_INLINE_FUNCTION
  void join( volatile value_type & dst, const volatile value_type & src ) const
  {
    for ( int k = 0; k < 3; ++k ) {
      dst.value[k] += src.value[k];
    }
  }

  // Per-thread contribution over this thread's chunk of the work range.
  KOKKOS_INLINE_FUNCTION
  void operator()( const typename policy_type::member_type ind, value_type & dst ) const
  {
    // Flattened rank of this thread and total thread count over the league.
    const int global_rank  = ind.team_rank() + ind.team_size() * ind.league_rank();
    const int global_count = ind.team_size() * ind.league_size();

    // Chunk length is nwork / global_count, rounded up.
    const int per_thread = ( nwork + global_count - 1 ) / global_count;

    const size_type first = per_thread * global_rank;

    // The last chunk is clipped to nwork.
    const size_type last = first + per_thread < nwork ? first + per_thread : nwork;

    for ( size_type w = first; w < last; ++w ) {
      dst.value[0] += 1;
      dst.value[1] += w + 1;
      dst.value[2] += nwork - w;
    }
  }
};
} // namespace Test
namespace {
template< typename ScalarType, class DeviceType, class ScheduleType >
class TestReduceTeam
{
public:
typedef DeviceType execution_space;
typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type;
typedef typename execution_space::size_type size_type;
TestReduceTeam( const size_type & nwork ) { run_test( nwork ); }
void run_test( const size_type & nwork )
{
typedef Test::ReduceTeamFunctor< ScalarType, execution_space, ScheduleType> functor_type;
typedef typename functor_type::value_type value_type;
typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
enum { Count = 3 };
enum { Repeat = 100 };
value_type result[ Repeat ];
const unsigned long nw = nwork;
const unsigned long nsum = nw % 2 ? nw * ( ( nw + 1 ) / 2 )
: ( nw / 2 ) * ( nw + 1 );
const unsigned team_size = policy_type::team_size_recommended( functor_type( nwork ) );
const unsigned league_size = ( nwork + team_size - 1 ) / team_size;
policy_type team_exec( league_size, team_size );
for ( unsigned i = 0; i < Repeat; ++i ) {
result_type tmp( & result[i] );
Kokkos::parallel_reduce( team_exec, functor_type( nwork ), tmp );
}
execution_space::fence();
for ( unsigned i = 0; i < Repeat; ++i ) {
for ( unsigned j = 0; j < Count; ++j ) {
const unsigned long correct = 0 == j % 3 ? nw : nsum;
ASSERT_EQ( (ScalarType) correct, result[i].value[j] );
}
}
}
};
} // namespace
/*--------------------------------------------------------------------------*/
namespace Test {
// Reduction functor that checks team_reduce ( max ) and team_scan from inside a
// team kernel. The reduced value is an error flag: it stays 0 when every check
// passed and becomes non-zero when any thread saw a wrong result.
//
// Side outputs:
//   total : written by thread ( league 0, team rank 0 ) with
//           thread_count * ( thread_count + 1 ) / 2.
//   accum : passed as the global accumulator to the final team_scan, to which
//           every thread contributes 1 + its flattened rank; the host driver
//           compares it with total.
template< class DeviceType, class ScheduleType >
class ScanTeamFunctor
{
public:
  typedef DeviceType execution_space;
  typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type;
  typedef long int value_type;

  Kokkos::View< value_type, execution_space > accum;
  Kokkos::View< value_type, execution_space > total;

  ScanTeamFunctor() : accum( "accum" ), total( "total" ) {}

  // The error flag starts cleared.
  KOKKOS_INLINE_FUNCTION
  void init( value_type & error ) const { error = 0; }

  // The joined flag is set as soon as any contribution is non-zero.
  KOKKOS_INLINE_FUNCTION
  void join( value_type volatile & error, value_type volatile const & input ) const
  { if ( input ) error = 1; }

  // Max-join operation on long int values.
  struct JoinMax {
    typedef long int value_type;

    KOKKOS_INLINE_FUNCTION
    void join( value_type volatile & dst, value_type volatile const & input ) const
    { if ( dst < input ) dst = input; }
  };

  KOKKOS_INLINE_FUNCTION
  void operator()( const typename policy_type::member_type ind, value_type & error ) const
  {
    if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
      const long int thread_count = ind.league_size() * ind.team_size();

      total() = ( thread_count * ( thread_count + 1 ) ) / 2;
    }

    // Team max: the largest contribution comes from the last team rank.
    int long m = (long int) ( ind.league_rank() + ind.team_rank() );
    ind.team_reduce( Kokkos::Experimental::Max<int long>(m) );

    if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
      printf( "ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n",
              ind.league_rank(), ind.team_rank(),
              ind.league_size(), ind.team_size(),
              (long int) ( ind.league_rank() + ( ind.team_size() - 1 ) ), m );

      // Record the failure so the host-side assertion sees it; a printed
      // message alone would let the test pass.
      error = 1;
    }

    // Scan: thread r contributes ( league_rank + 1 ) + ( r + 1 ), and the
    // result is compared with the sum of the contributions of ranks 0 .. r-1.
    const long int answer =
      ( ind.league_rank() + 1 ) * ind.team_rank() + ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2;

    const long int result =
      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );

    // The same scan is issued twice; both results must match the answer.
    const long int result2 =
      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );

    if ( answer != result || answer != result2 ) {
      printf( "ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
              ind.league_rank(), ind.team_rank(),
              ind.league_size(), ind.team_size(),
              answer, result, result2 );

      error = 1;
    }

    const long int thread_rank = ind.team_rank() +
                                 ind.team_size() * ind.league_rank();

    // Scan with a global accumulator: every thread adds 1 + its flattened rank.
    ind.team_scan( 1 + thread_rank, accum.data() );
  }
};
// Host driver for Test::ScanTeamFunctor: repeatedly launches the functor and
// checks that no thread flagged an error and that the scan accumulator equals
// the total published by the kernel.
template< class DeviceType, class ScheduleType >
class TestScanTeam
{
public:
  typedef DeviceType execution_space;
  typedef long int value_type;
  typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type;
  typedef Test::ScanTeamFunctor<DeviceType, ScheduleType> functor_type;

  // Constructing the object runs the test.
  TestScanTeam( const size_t nteam ) { run_test( nteam ); }

  void run_test( const size_t nteam )
  {
    typedef Kokkos::View< long int, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;

    const unsigned REPEAT = 100000;

    // REPEAT / nteam rounded up; an empty league gets one iteration, which
    // also avoids dividing by zero.
    const unsigned Repeat = ( nteam == 0 ) ? 1 : ( REPEAT + nteam - 1 ) / nteam;

    functor_type functor;

    policy_type team_exec( nteam, policy_type::team_size_max( functor ) );

    for ( unsigned rep = 0; rep < Repeat; ++rep ) {
      long int host_accum = 0;
      long int host_total = 0;
      long int host_error = 0;

      // Reset the device-side accumulator to zero before each launch.
      Kokkos::deep_copy( functor.accum, host_total );

      Kokkos::parallel_reduce( team_exec, functor, result_type( & host_error ) );
      DeviceType::fence();

      Kokkos::deep_copy( host_accum, functor.accum );
      Kokkos::deep_copy( host_total, functor.total );

      ASSERT_EQ( host_error, 0 );
      ASSERT_EQ( host_total, host_accum );
    }

    execution_space::fence();
  }
};
} // namespace Test
/*--------------------------------------------------------------------------*/
namespace Test {
// Reduction functor that allocates two int arrays from team shared memory,
// fills them cooperatively, and has the last thread of each team verify the
// contents. The reduced value counts the errors seen.
template< class ExecSpace, class ScheduleType >
struct SharedTeamFunctor {
  typedef ExecSpace execution_space;
  typedef int value_type;
  typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type;

  enum { SHARED_COUNT = 1000 };

  typedef typename ExecSpace::scratch_memory_space shmem_space;

  // TBD: MemoryUnmanaged should be the default for shared memory space.
  typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;

  // Tell how much shared memory will be required by this functor:
  // room for the two SHARED_COUNT-element arrays allocated in operator().
  inline
  unsigned team_shmem_size( int team_size ) const
  {
    const auto bytes_A = shared_int_array_type::shmem_size( SHARED_COUNT );
    const auto bytes_B = shared_int_array_type::shmem_size( SHARED_COUNT );

    return bytes_A + bytes_B;
  }

  KOKKOS_INLINE_FUNCTION
  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
  {
    const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
    const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );

    const bool alloc_failed =
      SHARED_COUNT > 0 && ( shared_A.data() == nullptr || shared_B.data() == nullptr );

    if ( alloc_failed ) {
      printf ("member( %d/%d , %d/%d ) Failed to allocate shared memory of size %lu\n"
             , ind.league_rank()
             , ind.league_size()
             , ind.team_rank()
             , ind.team_size()
             , static_cast<unsigned long>( SHARED_COUNT )
             );

      ++update; // Failure to allocate is an error.
      return;
    }

    const int league = ind.league_rank();

    // Each thread writes a strided subset of both arrays.
    for ( int k = ind.team_rank(); k < SHARED_COUNT; k += ind.team_size() ) {
      shared_A[k] = k + league;
      shared_B[k] = 2 * k + league;
    }

    ind.team_barrier();

    // Only the last thread of the team checks the full contents.
    if ( ind.team_rank() + 1 == ind.team_size() ) {
      for ( int k = 0; k < SHARED_COUNT; ++k ) {
        if ( shared_A[k] != k + league ) {
          ++update;
        }

        if ( shared_B[k] != 2 * k + league ) {
          ++update;
        }
      }
    }
  }
};
} // namespace Test
namespace {
template< class ExecSpace, class ScheduleType >
struct TestSharedTeam {
TestSharedTeam() { run(); }
void run()
{
typedef Test::SharedTeamFunctor<ExecSpace, ScheduleType> Functor;
typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
const size_t team_size = Kokkos::TeamPolicy< ScheduleType, ExecSpace >::team_size_max( Functor() );
Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size, team_size );
typename Functor::value_type error_count = 0;
Kokkos::parallel_reduce( team_exec, Functor(), result_type( & error_count ) );
ASSERT_EQ( error_count, 0 );
}
};
} // namespace
namespace Test {
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
// Lambda-based variant of the shared-memory team test: the kernel body is
// the same fill-and-verify logic as Test::SharedTeamFunctor::operator(), but it
// is dispatched as a KOKKOS_LAMBDA and the scratch size is requested explicitly
// on the policy. The error count is reduced into an unmanaged view living in
// MemorySpace.
template< class MemorySpace, class ExecSpace, class ScheduleType >
struct TestLambdaSharedTeam {
// Constructing the object runs the test.
TestLambdaSharedTeam() { run(); }
void run()
{
typedef Test::SharedTeamFunctor< ExecSpace, ScheduleType > Functor;
//typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
typedef Kokkos::View< typename Functor::value_type, MemorySpace, Kokkos::MemoryUnmanaged > result_type;
typedef typename ExecSpace::scratch_memory_space shmem_space;
// TBD: MemoryUnmanaged should be the default for shared memory space.
typedef Kokkos::View< int*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;
const int SHARED_COUNT = 1000;
// Team size is 1 unless the execution space is Kokkos::Cuda, where it is 128.
int team_size = 1;
#ifdef KOKKOS_ENABLE_CUDA
if ( std::is_same< ExecSpace, Kokkos::Cuda >::value ) team_size = 128;
#endif
Kokkos::TeamPolicy< ScheduleType, ExecSpace > team_exec( 8192 / team_size, team_size );
// Level-0 per-team scratch sized for the two int arrays of SHARED_COUNT
// entries allocated inside the lambda.
team_exec = team_exec.set_scratch_size( 0, Kokkos::PerTeam( SHARED_COUNT * 2 * sizeof( int ) ) );
typename Functor::value_type error_count = 0;
Kokkos::parallel_reduce( team_exec, KOKKOS_LAMBDA
( const typename Kokkos::TeamPolicy< ScheduleType, ExecSpace >::member_type & ind, int & update )
{
const shared_int_array_type shared_A( ind.team_shmem(), SHARED_COUNT );
const shared_int_array_type shared_B( ind.team_shmem(), SHARED_COUNT );
if ( ( shared_A.data () == nullptr && SHARED_COUNT > 0 ) ||
( shared_B.data () == nullptr && SHARED_COUNT > 0 ) )
{
printf( "Failed to allocate shared memory of size %lu\n",
static_cast<unsigned long>( SHARED_COUNT ) );
++update; // Failure to allocate is an error.
}
else {
// Each thread writes a strided subset of both arrays.
for ( int i = ind.team_rank(); i < SHARED_COUNT; i += ind.team_size() ) {
shared_A[i] = i + ind.league_rank();
shared_B[i] = 2 * i + ind.league_rank();
}
// All writes must be complete before the last thread reads them back.
ind.team_barrier();
// Only the last thread of the team verifies the full contents.
if ( ind.team_rank() + 1 == ind.team_size() ) {
for ( int i = 0; i < SHARED_COUNT; ++i ) {
if ( shared_A[i] != i + ind.league_rank() ) {
++update;
}
if ( shared_B[i] != 2 * i + ind.league_rank() ) {
++update;
}
}
}
}
}, result_type( & error_count ) );
ASSERT_EQ( error_count, 0 );
}
};
#endif
#endif
} // namespace Test
namespace Test {
// Reduction functor that exercises level-1 team and thread scratch memory.
// The reduced value counts the errors seen.
//
// Three views are carved out of scratch:
//   scratch_ptr : team scratch, 3 * team_size entries, used as a table of
//                 addresses. Slot [r] holds the scratch_A address seen by
//                 team rank r; slot [team_size + r] holds that rank's
//                 scratch_B address. The remaining slots are never written.
//   scratch_A   : team scratch, SHARED_TEAM_COUNT entries.
//   scratch_B   : thread scratch, SHARED_THREAD_COUNT entries.
template< class ExecSpace, class ScheduleType >
struct ScratchTeamFunctor {
  typedef ExecSpace execution_space;
  typedef int value_type;
  typedef Kokkos::TeamPolicy< ScheduleType, execution_space > policy_type;

  enum { SHARED_TEAM_COUNT = 100 };
  enum { SHARED_THREAD_COUNT = 10 };

  typedef typename ExecSpace::scratch_memory_space shmem_space;

  // TBD: MemoryUnmanaged should be the default for shared memory space.
  typedef Kokkos::View< size_t*, shmem_space, Kokkos::MemoryUnmanaged > shared_int_array_type;

  KOKKOS_INLINE_FUNCTION
  void operator()( const typename policy_type::member_type & ind, value_type & update ) const
  {
    const shared_int_array_type scratch_ptr( ind.team_scratch( 1 ), 3 * ind.team_size() );
    const shared_int_array_type scratch_A( ind.team_scratch( 1 ), SHARED_TEAM_COUNT );
    const shared_int_array_type scratch_B( ind.thread_scratch( 1 ), SHARED_THREAD_COUNT );

    if ( ( scratch_ptr.data() == nullptr ) ||
         ( scratch_A.  data() == nullptr && SHARED_TEAM_COUNT > 0 ) ||
         ( scratch_B.  data() == nullptr && SHARED_THREAD_COUNT > 0 ) )
    {
      printf( "Failed to allocate shared memory of size %lu\n",
              static_cast<unsigned long>( SHARED_TEAM_COUNT ) );

      ++update; // Failure to allocate is an error.
    }
    else {
      // The team fills the team-level array cooperatively.
      Kokkos::parallel_for( Kokkos::TeamThreadRange( ind, 0, (int) SHARED_TEAM_COUNT ), [&] ( const int & i ) {
        scratch_A[i] = i + ind.league_rank();
      });

      // Every thread fills its own thread-level array.
      for ( int i = 0; i < SHARED_THREAD_COUNT; i++ ) {
        scratch_B[i] = 10000 * ind.league_rank() + 100 * ind.team_rank() + i;
      }

      // Publish this thread's view addresses so the whole team can compare.
      scratch_ptr[ind.team_rank()] = (size_t) scratch_A.data();
      scratch_ptr[ind.team_rank() + ind.team_size()] = (size_t) scratch_B.data();

      ind.team_barrier();

      for ( int i = 0; i < SHARED_TEAM_COUNT; i++ ) {
        if ( scratch_A[i] != size_t( i + ind.league_rank() ) ) ++update;
      }

      // All threads of the team must see the same team-level allocation.
      for ( int i = 0; i < ind.team_size(); i++ ) {
        if ( scratch_ptr[0] != scratch_ptr[i] ) ++update;
      }

      // Consecutive thread-level allocations must be at least one full
      // scratch_B array apart. With a single-thread team there is no second
      // allocation: slot [1 + team_size] was never written, so the check is
      // skipped rather than reading uninitialized scratch memory.
      if ( ind.team_size() > 1 &&
           scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] < SHARED_THREAD_COUNT * sizeof( size_t ) ) {
        ++update;
      }

      // The distance between consecutive thread-level allocations must be
      // the same for every pair of neighbouring ranks.
      for ( int i = 1; i < ind.team_size(); i++ ) {
        if ( ( scratch_ptr[i + ind.team_size()] - scratch_ptr[i - 1 + ind.team_size()] ) !=
             ( scratch_ptr[1 + ind.team_size()] - scratch_ptr[0 + ind.team_size()] ) )
        {
          ++update;
        }
      }
    }
  }
};
} // namespace Test
namespace {
template< class ExecSpace, class ScheduleType >
struct TestScratchTeam {
TestScratchTeam() { run(); }
void run()
{
typedef Test::ScratchTeamFunctor<ExecSpace, ScheduleType> Functor;
typedef Kokkos::View< typename Functor::value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type;
typedef Kokkos::TeamPolicy< ScheduleType, ExecSpace > p_type;
const size_t team_size = p_type::team_size_max( Functor() );
p_type team_exec( 8192 / team_size, team_size );
typename Functor::value_type error_count = 0;
int team_scratch_size = Functor::shared_int_array_type::shmem_size( Functor::SHARED_TEAM_COUNT ) +
Functor::shared_int_array_type::shmem_size( 3 * team_size );
int thread_scratch_size = Functor::shared_int_array_type::shmem_size( Functor::SHARED_THREAD_COUNT );
Kokkos::parallel_reduce( team_exec.set_scratch_size( 1, Kokkos::PerTeam( team_scratch_size ),
Kokkos::PerThread( thread_scratch_size ) ),
Functor(), result_type( & error_count ) );
ASSERT_EQ( error_count, 0 );
Kokkos::parallel_reduce( p_type( 8192 / team_size, team_size ,
Kokkos::ScratchRequest( 1, Kokkos::PerTeam( team_scratch_size ),
Kokkos::PerThread( thread_scratch_size ))
),
Functor(), result_type( & error_count ) );
ASSERT_EQ( error_count, 0 );
}
};
} // namespace
namespace Test {
// Kernel body shared by the multi-level scratch tests.
//
// Allocates three team-level and three thread-level views from each of scratch
// level 0 (128 / 16 doubles) and scratch level 1 (128000 / 16000 doubles),
// fills them with values derived from the view number, the loop index and the
// league / team rank, then reads everything back.
//
// Returns the number of mismatches counted by the calling thread.
template< class ExecSpace >
KOKKOS_INLINE_FUNCTION
int test_team_mulit_level_scratch_loop_body( const typename Kokkos::TeamPolicy<ExecSpace>::member_type& team ) {
// Allocations from the two levels and the two scopes (team / thread) are
// deliberately interleaved: level 0, then level 1, then level 0 again.
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team1( team.team_scratch( 0 ), 128 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread1( team.thread_scratch( 0 ), 16 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team2( team.team_scratch( 0 ), 128 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread2( team.thread_scratch( 0 ), 16 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team1( team.team_scratch( 1 ), 128000 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread1( team.thread_scratch( 1 ), 16000 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team2( team.team_scratch( 1 ), 128000 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread2( team.thread_scratch( 1 ), 16000 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_team3( team.team_scratch( 0 ), 128 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > a_thread3( team.thread_scratch( 0 ), 16 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_team3( team.team_scratch( 1 ), 128000 );
Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > b_thread3( team.thread_scratch( 1 ), 16000 );
// The explicit types for 0 and 128 are here to test TeamThreadRange accepting different
// types for begin and end.
// Fill phase, level 0 team views.
Kokkos::parallel_for( Kokkos::TeamThreadRange( team, int( 0 ), unsigned( 128 ) ), [&] ( const int & i )
{
a_team1( i ) = 1000000 + i + team.league_rank() * 100000;
a_team2( i ) = 2000000 + i + team.league_rank() * 100000;
a_team3( i ) = 3000000 + i + team.league_rank() * 100000;
});
team.team_barrier();
// Fill phase, level 0 thread views.
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
{
a_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
a_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
a_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
});
// Fill phase, level 1 team views.
Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
{
b_team1( i ) = 1000000 + i + team.league_rank() * 100000;
b_team2( i ) = 2000000 + i + team.league_rank() * 100000;
b_team3( i ) = 3000000 + i + team.league_rank() * 100000;
});
team.team_barrier();
// Fill phase, level 1 thread views.
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
{
b_thread1( i ) = 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
b_thread2( i ) = 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
b_thread3( i ) = 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000;
});
team.team_barrier();
// Verification phase: each check recomputes the value written above and
// counts a mismatch in the local error counter captured by reference.
int error = 0;
Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128 ), [&] ( const int & i )
{
if ( a_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
if ( a_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
if ( a_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
});
team.team_barrier();
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16 ), [&] ( const int & i )
{
if ( a_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
if ( a_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
if ( a_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
});
Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, 128000 ), [&] ( const int & i )
{
if ( b_team1( i ) != 1000000 + i + team.league_rank() * 100000 ) error++;
if ( b_team2( i ) != 2000000 + i + team.league_rank() * 100000 ) error++;
if ( b_team3( i ) != 3000000 + i + team.league_rank() * 100000 ) error++;
});
team.team_barrier();
Kokkos::parallel_for( Kokkos::ThreadVectorRange( team, 16000 ), [&] ( const int & i )
{
if ( b_thread1( i ) != 1000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
if ( b_thread2( i ) != 2000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
if ( b_thread3( i ) != 3000000 + 100000 * team.team_rank() + 16 - i + team.league_rank() * 100000 ) error++;
});
return error;
}
// Dispatch tag selecting the reducing operator() ( used with parallel_reduce ).
struct TagReduce {};
// Dispatch tag selecting the non-reducing operator() ( used with parallel_for ).
struct TagFor {};
template< class ExecSpace, class ScheduleType >
struct ClassNoShmemSizeFunction {
typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
KOKKOS_INLINE_FUNCTION
void operator()( const TagFor &, const member_type & team ) const {
int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
errors() += error;
}
KOKKOS_INLINE_FUNCTION
void operator() ( const TagReduce &, const member_type & team, int & error ) const {
error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
}
void run() {
Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
errors = d_errors;
const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
{
Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this );
Kokkos::fence();
typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
Kokkos::deep_copy( h_errors, d_errors );
ASSERT_EQ( h_errors(), 0 );
}
{
int error = 0;
Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ), *this, error );
Kokkos::fence();
ASSERT_EQ( error, 0 );
}
};
};
template< class ExecSpace, class ScheduleType >
struct ClassWithShmemSizeFunction {
typedef typename Kokkos::TeamPolicy< ExecSpace, ScheduleType >::member_type member_type;
Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
KOKKOS_INLINE_FUNCTION
void operator()( const TagFor &, const member_type & team ) const {
int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
errors() += error;
}
KOKKOS_INLINE_FUNCTION
void operator() ( const TagReduce &, const member_type & team, int & error ) const {
error += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
}
void run() {
Kokkos::View< int, ExecSpace > d_errors = Kokkos::View< int, ExecSpace >( "Errors" );
errors = d_errors;
const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
{
Kokkos::TeamPolicy< TagFor, ExecSpace, ScheduleType > policy( 10, 8, 16 );
Kokkos::parallel_for( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
Kokkos::PerThread( per_thread1 ) ),
*this );
Kokkos::fence();
typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( d_errors );
Kokkos::deep_copy( h_errors, d_errors );
ASSERT_EQ( h_errors(), 0 );
}
{
int error = 0;
Kokkos::TeamPolicy< TagReduce, ExecSpace, ScheduleType > policy( 10, 8, 16 );
Kokkos::parallel_reduce( policy.set_scratch_size( 1, Kokkos::PerTeam( per_team1 ),
Kokkos::PerThread( per_thread1 ) ),
*this, error );
Kokkos::fence();
ASSERT_EQ( error, 0 );
}
};
unsigned team_shmem_size( int team_size ) const {
const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
return per_team0 + team_size * per_thread0;
}
};
// Lambda-dispatched variant of the multi-level scratch test: runs the shared
// kernel body once through parallel_for (errors accumulated in an atomic view)
// and once through parallel_reduce, expecting zero errors from both.
// The body is compiled only when lambda dispatch is enabled and, for CUDA
// builds, when CUDA_VERSION is at least 8000; otherwise the function is empty.
template< class ExecSpace, class ScheduleType >
void test_team_mulit_level_scratch_test_lambda() {
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
#if !defined(KOKKOS_ENABLE_CUDA) || ( 8000 <= CUDA_VERSION )
// `errors` is an atomic-access alias of `d_errors`.
Kokkos::View< int, ExecSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > errors;
Kokkos::View< int, ExecSpace > d_errors( "Errors" );
errors = d_errors;
// Three views of each kind are allocated by the kernel body.
const int per_team0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128 );
const int per_thread0 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16 );
const int per_team1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 128000 );
const int per_thread1 = 3 * Kokkos::View< double*, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> >::shmem_size( 16000 );
Kokkos::TeamPolicy< ExecSpace, ScheduleType > policy( 10, 8, 16 );
// parallel_for variant: both scratch levels are requested on the policy.
Kokkos::parallel_for( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace >::member_type & team )
{
int error = test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
errors() += error;
});
Kokkos::fence();
typename Kokkos::View< int, ExecSpace >::HostMirror h_errors = Kokkos::create_mirror_view( errors );
Kokkos::deep_copy( h_errors, d_errors );
ASSERT_EQ( h_errors(), 0 );
// parallel_reduce variant: mismatches are summed into `error`.
int error = 0;
Kokkos::parallel_reduce( policy.set_scratch_size( 0, Kokkos::PerTeam( per_team0 ), Kokkos::PerThread( per_thread0 ) ).set_scratch_size( 1, Kokkos::PerTeam( per_team1 ), Kokkos::PerThread( per_thread1 ) ),
KOKKOS_LAMBDA ( const typename Kokkos::TeamPolicy< ExecSpace >::member_type & team, int & count )
{
count += test_team_mulit_level_scratch_loop_body< ExecSpace >( team );
}, error );
ASSERT_EQ( error, 0 );
Kokkos::fence();
#endif
#endif
}
} // namespace Test
namespace {
template< class ExecSpace, class ScheduleType >
struct TestMultiLevelScratchTeam {
TestMultiLevelScratchTeam() { run(); }
void run()
{
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
Test::test_team_mulit_level_scratch_test_lambda< ExecSpace, ScheduleType >();
#endif
Test::ClassNoShmemSizeFunction< ExecSpace, ScheduleType > c1;
c1.run();
Test::ClassWithShmemSizeFunction< ExecSpace, ScheduleType > c2;
c2.run();
}
};
} // namespace
namespace Test {
template< class ExecSpace >
struct TestShmemSize {
TestShmemSize() { run(); }
void run()
{
typedef Kokkos::View< long***, ExecSpace > view_type;
size_t d1 = 5;
size_t d2 = 6;
size_t d3 = 7;
size_t size = view_type::shmem_size( d1, d2, d3 );
ASSERT_EQ( size, d1 * d2 * d3 * sizeof( long ) );
test_layout_stride();
}
void test_layout_stride()
{
int rank = 3;
int order[3] = {2, 0, 1};
int extents[3] = {100, 10, 3};
auto s1 = Kokkos::View<double***, Kokkos::LayoutStride, ExecSpace>::shmem_size(Kokkos::LayoutStride::order_dimensions(rank, order, extents));
auto s2 = Kokkos::View<double***, Kokkos::LayoutRight, ExecSpace>::shmem_size(extents[0], extents[1], extents[2]);
ASSERT_EQ(s1, s2);
}
};
} // namespace Test