// File: lammps/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp

/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#include <initializer_list>
#include <cstdio>   // printf is used below for the Cuda tile-dimension diagnostic

#include <impl/KokkosExp_Host_IterateTile.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>

#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <Cuda/KokkosExp_Cuda_IterateTile.hpp>
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif
namespace Kokkos {
// ------------------------------------------------------------------ //
enum class Iterate
{
  Default, // Default for the device
  Left,    // Left indices stride fastest
  Right    // Right indices stride fastest
};
template <typename ExecSpace>
struct default_outer_direction
{
  using type = Iterate;
#if defined( KOKKOS_ENABLE_CUDA )
  static constexpr Iterate value = Iterate::Left;
#else
  static constexpr Iterate value = Iterate::Right;
#endif
};

template <typename ExecSpace>
struct default_inner_direction
{
  using type = Iterate;
#if defined( KOKKOS_ENABLE_CUDA )
  static constexpr Iterate value = Iterate::Left;
#else
  static constexpr Iterate value = Iterate::Right;
#endif
};
// Iteration Pattern
template < unsigned N
         , Iterate OuterDir = Iterate::Default
         , Iterate InnerDir = Iterate::Default
         >
struct Rank
{
  static_assert( N != 0u, "Kokkos Error: rank 0 undefined" );
  static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range" );
  static_assert( N < 7u,  "Kokkos Error: Unsupported rank..." );

  using iteration_pattern = Rank<N, OuterDir, InnerDir>;

  static constexpr int rank = N;
  static constexpr Iterate outer_direction = OuterDir;
  static constexpr Iterate inner_direction = InnerDir;
};
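// Usage sketch (editorial addition, not part of the original header; the view
// "a" and extents N0/N1 are hypothetical): a Rank<2> iteration pattern plugged
// into an MDRangePolicy and executed with parallel_for.
//
//   Kokkos::MDRangePolicy< Kokkos::Rank<2> > policy( {0, 0}, {N0, N1} );
//   Kokkos::parallel_for( policy, KOKKOS_LAMBDA( const int i, const int j ) {
//     a(i, j) = i + j;
//   });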
// multi-dimensional iteration pattern
template <typename... Properties>
struct MDRangePolicy
: public Kokkos::Impl::PolicyTraits<Properties ...>
{
using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
using range_policy = RangePolicy<Properties...>;
using impl_range_policy = RangePolicy< typename traits::execution_space
                                     , typename traits::schedule_type
                                     , typename traits::index_type
                                     >;
typedef MDRangePolicy execution_policy; // needed for is_execution_space interrogation
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename traits::iteration_pattern;
using work_tag = typename traits::work_tag;
using launch_bounds = typename traits::launch_bounds;
using member_type = typename range_policy::member_type;
enum { rank = static_cast<int>(iteration_pattern::rank) };
using index_type = typename traits::index_type;
using array_index_type = long;
using point_type = Kokkos::Array<array_index_type,rank>; //was index_type
using tile_type = Kokkos::Array<array_index_type,rank>;
// If point_type or tile_type were not templated on a signed integral type
// (i.e. if it were unsigned), then a user passing an initializer_list of
// runtime-determined, non-const values of signed integral type would get a
// compiler error, because the implicit conversion is invalid:
// "conversion from integer or unscoped enumeration type to integer type that
//  cannot represent all values of the original, except where source is a
//  constant expression whose value can be stored exactly in the target type"
// The user would then have to either pass a matching index_type as a template
// parameter to the MDRangePolicy or static_cast the individual values.
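// Illustrative sketch of the workarounds mentioned above (editorial addition;
// n0 and n1 stand for runtime-determined signed values):
//
//   int n0 = 100, n1 = 200;
//   // static_cast the runtime values to the signed array_index_type:
//   Kokkos::MDRangePolicy< Kokkos::Rank<2> >
//     policy( {0, 0}, { static_cast<long>(n0), static_cast<long>(n1) } );
//   // Alternatively, pass a Kokkos::IndexType<> trait so the policy's
//   // index_type matches the type of the values being passed.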
point_type m_lower;
point_type m_upper;
tile_type m_tile;
point_type m_tile_end;
index_type m_num_tiles;
index_type m_prod_tile_dims;
/*
// NDE enum impl definition alternative - replace static constexpr int ?
enum { outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value ) };
enum { inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) };
enum { Right = static_cast<int>( Iterate::Right ) };
enum { Left = static_cast<int>( Iterate::Left ) };
*/
//static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename traits::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename traits::execution_space>::value ) ;
// Workaround for the Intel 14 compiler not handling scoped enums correctly:
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
  : m_lower(lower)
  , m_upper(upper)
  , m_tile(tile)
  , m_num_tiles(1)
  , m_prod_tile_dims(1)
{
  // Host
  if ( true
#if defined(KOKKOS_ENABLE_CUDA)
       && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
#endif
     )
  {
    index_type span;
    for (int i=0; i<rank; ++i) {
      span = upper[i] - lower[i];
      if ( m_tile[i] <= 0 ) {
        if ( ((int)inner_direction == (int)Right && (i < rank-1))
          || ((int)inner_direction == (int)Left && (i > 0)) )
        {
          m_tile[i] = 2;
        }
        else {
          m_tile[i] = span;
        }
      }
      m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
      m_num_tiles *= m_tile_end[i];
      m_prod_tile_dims *= m_tile[i];
    }
  }
#if defined(KOKKOS_ENABLE_CUDA)
  else // Cuda
  {
    index_type span;
    for (int i=0; i<rank; ++i) {
      span = m_upper[i] - m_lower[i];
      if ( m_tile[i] <= 0 ) {
        // TODO: determine what is a good default tile size for Cuda;
        // may be rank dependent
        if ( ((int)inner_direction == (int)Right && (i < rank-1))
          || ((int)inner_direction == (int)Left && (i > 0)) )
        {
          if ( m_prod_tile_dims < 512 ) {
            m_tile[i] = 2;
          } else {
            m_tile[i] = 1;
          }
        }
        else {
          m_tile[i] = 16;
        }
      }
      m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
      m_num_tiles *= m_tile_end[i];
      m_prod_tile_dims *= m_tile[i];
    }

    // Match Cuda restriction for ParallelReduce: 1024,1024,64 max per dim (Kepler),
    // but the product num_threads must be < 1024.
    if ( m_prod_tile_dims > 512 ) {
      printf(" Tile dimensions exceed Cuda limits\n");
      Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
      //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
    }
  }
#endif
}
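// Construction sketch for the constructor above (editorial addition; the 64^3
// extents and 4x4x4 tiles are hypothetical):
//
//   using policy3 = Kokkos::MDRangePolicy< Kokkos::Rank<3> >;
//   policy3::point_type lo{{ 0, 0, 0 }};
//   policy3::point_type hi{{ 64, 64, 64 }};
//   policy3::tile_type  tl{{ 4, 4, 4 }};
//   policy3 p( lo, hi, tl ); // explicit 4x4x4 tiles -> 16x16x16 tiles total
//   policy3 q( lo, hi );     // defaults from the host branch above:
//                            // full span in the fastest-striding dimension, 2 elsewhere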
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
{
  // Validate the runtime sizes of the initializer lists against the rank.
  if ( static_cast<int>(lower.size()) != rank || static_cast<int>(upper.size()) != rank )
    Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");

  for ( auto i = 0; i < rank; ++i ) {
    m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
    m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
    if ( static_cast<int>(tile.size()) == rank )
      m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
    else
      m_tile[i] = 0;
  }

  m_num_tiles = 1;
  m_prod_tile_dims = 1;

  // Host
  if ( true
#if defined(KOKKOS_ENABLE_CUDA)
       && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
#endif
     )
  {
    index_type span;
    for (int i=0; i<rank; ++i) {
      span = m_upper[i] - m_lower[i];
      if ( m_tile[i] <= 0 ) {
        if ( ((int)inner_direction == (int)Right && (i < rank-1))
          || ((int)inner_direction == (int)Left && (i > 0)) )
        {
          m_tile[i] = 2;
        }
        else {
          m_tile[i] = span;
        }
      }
      m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
      m_num_tiles *= m_tile_end[i];
      m_prod_tile_dims *= m_tile[i];
    }
  }
#if defined(KOKKOS_ENABLE_CUDA)
  else // Cuda
  {
    index_type span;
    for (int i=0; i<rank; ++i) {
      span = m_upper[i] - m_lower[i];
      if ( m_tile[i] <= 0 ) {
        // TODO: determine what is a good default tile size for Cuda;
        // may be rank dependent
        if ( ((int)inner_direction == (int)Right && (i < rank-1))
          || ((int)inner_direction == (int)Left && (i > 0)) )
        {
          if ( m_prod_tile_dims < 512 ) {
            m_tile[i] = 2;
          } else {
            m_tile[i] = 1;
          }
        }
        else {
          m_tile[i] = 16;
        }
      }
      m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
      m_num_tiles *= m_tile_end[i];
      m_prod_tile_dims *= m_tile[i];
    }

    // Match Cuda restriction for ParallelReduce: 1024,1024,64 max per dim (Kepler),
    // but the product num_threads must be < 1024.
    if ( m_prod_tile_dims > 512 ) {
      printf(" Tile dimensions exceed Cuda limits\n");
      Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
      //Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
    }
  }
#endif
}
};
} // namespace Kokkos
// For backward compatibility
namespace Kokkos { namespace Experimental {
using Kokkos::MDRangePolicy;
using Kokkos::Rank;
using Kokkos::Iterate;
} } // end Kokkos::Experimental
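// Backward-compatibility sketch (editorial addition): code written against the
// old Experimental namespace continues to compile unchanged, e.g.
//
//   Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> > p( {0, 0}, {10, 10} );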
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
// md_parallel_for - deprecated: use parallel_for instead
// ------------------------------------------------------------------ //
namespace Kokkos { namespace Experimental {
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
// Cuda specialization
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
template <typename MDRange, typename Functor>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
#endif
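// Recommended replacement for md_parallel_for (editorial sketch; the view "a"
// and extents N0/N1 are hypothetical):
//
//   Kokkos::parallel_for( "label",
//     Kokkos::MDRangePolicy< Kokkos::Rank<2> >( {0, 0}, {N0, N1} ),
//     KOKKOS_LAMBDA( const int i, const int j ) { a(i, j) = i + j; } );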
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
// md_parallel_reduce - deprecated: use parallel_reduce instead
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
, Functor const& f
, ValueType & v
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( const std::string& str
, MDRange const& range
, Functor const& f
, ValueType & v
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
// Cuda - md_parallel_reduce not implemented - use parallel_reduce
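// Recommended replacement for md_parallel_reduce (editorial sketch; the view
// "a" and extents N0/N1 are hypothetical):
//
//   double sum = 0;
//   Kokkos::parallel_reduce( "label",
//     Kokkos::MDRangePolicy< Kokkos::Rank<2> >( {0, 0}, {N0, N1} ),
//     KOKKOS_LAMBDA( const int i, const int j, double& lsum ) { lsum += a(i, j); },
//     sum );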
} } // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP