Updating Kokkos lib to v2.03.05
This commit is contained in:
@ -44,14 +44,12 @@
|
||||
#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
|
||||
#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
|
||||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
|
||||
@ -1298,3 +1296,4 @@ protected:
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,11 +45,10 @@
|
||||
#define KOKKOS_CUDAEXEC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_abort.hpp>
|
||||
@ -99,6 +98,8 @@ CudaSpace::size_type cuda_internal_maximum_warp_count();
|
||||
CudaSpace::size_type cuda_internal_maximum_grid_count();
|
||||
CudaSpace::size_type cuda_internal_maximum_shared_words();
|
||||
|
||||
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
|
||||
|
||||
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
|
||||
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
|
||||
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
|
||||
@ -146,7 +147,7 @@ Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -319,3 +320,4 @@ struct CudaParallelLaunch< DriverType , false > {
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
|
||||
|
||||
|
||||
@ -41,16 +41,15 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
@ -890,9 +889,9 @@ void init_lock_arrays_cuda_space() {
|
||||
}
|
||||
}
|
||||
|
||||
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
|
||||
static void* ptr = NULL;
|
||||
static size_t current_size = 0;
|
||||
static std::int64_t current_size = 0;
|
||||
if(current_size == 0) {
|
||||
current_size = bytes;
|
||||
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
|
||||
@ -911,5 +910,7 @@ void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
|
||||
|
||||
}
|
||||
}
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,8 +45,6 @@
|
||||
#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
@ -75,7 +73,7 @@ shared_allocation_record( Kokkos::CudaSpace const & arg_space
|
||||
new( functor ) DestructFunctor( arg_destruct );
|
||||
|
||||
record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
|
||||
|
||||
|
||||
return record ;
|
||||
}
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,8 +45,6 @@
|
||||
#define KOKKOS_CUDA_ERROR_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
@ -67,3 +65,4 @@ inline void cuda_internal_safe_call( cudaError e , const char * name, const char
|
||||
|
||||
#endif //KOKKOS_ENABLE_CUDA
|
||||
#endif //KOKKOS_CUDA_ERROR_HPP
|
||||
|
||||
|
||||
@ -44,11 +44,11 @@
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Kokkos interfaces */
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
@ -56,7 +56,7 @@
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Standard 'C' libraries */
|
||||
#include <stdlib.h>
|
||||
#include <cstdlib>
|
||||
|
||||
/* Standard 'C++' libraries */
|
||||
#include <vector>
|
||||
@ -404,9 +404,23 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
// Query what compute capability architecture a kernel executes:
|
||||
m_cudaArch = cuda_kernel_arch();
|
||||
|
||||
if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
|
||||
int compiled_major = m_cudaArch / 100;
|
||||
int compiled_minor = ( m_cudaArch % 100 ) / 10;
|
||||
|
||||
if ( compiled_major < 5 && cudaProp.major >= 5 ) {
|
||||
std::stringstream ss;
|
||||
ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for compute capability "
|
||||
<< compiled_major << "." << compiled_minor
|
||||
<< " (< 5.0) on device with compute capability "
|
||||
<< cudaProp.major << "." << cudaProp.minor
|
||||
<< " (>=5.0), this would give incorrect results!"
|
||||
<< std::endl ;
|
||||
std::string msg = ss.str();
|
||||
Kokkos::abort( msg.c_str() );
|
||||
}
|
||||
if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
|
||||
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
|
||||
<< ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
|
||||
<< compiled_major << "." << compiled_minor
|
||||
<< " on device with compute capability "
|
||||
<< cudaProp.major << "." << cudaProp.minor
|
||||
<< " , this will likely reduce potential performance."
|
||||
@ -661,6 +675,15 @@ void CudaInternal::finalize()
|
||||
Cuda::size_type cuda_internal_multiprocessor_count()
|
||||
{ return CudaInternal::singleton().m_multiProcCount ; }
|
||||
|
||||
/** \brief Upper bound on the number of thread blocks that can be resident
 *         on the device at one time.
 *
 *  The bound is the multiprocessor count stored in the CudaInternal singleton
 *  multiplied by a fixed per-multiprocessor limit of 32 resident blocks.
 *  That limit is the value for compute capability 5.0 through 6.2; this
 *  function does not distinguish other architectures.
 */
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
{
  // Resident-block limit per multiprocessor (compute capability 5.0 - 6.2).
  enum : int { resident_blocks_per_sm = 32 };

  const auto sm_count = CudaInternal::singleton().m_multiProcCount ;

  return sm_count * resident_blocks_per_sm ;
}
|
||||
|
||||
Cuda::size_type cuda_internal_maximum_warp_count()
|
||||
{ return CudaInternal::singleton().m_maxWarpCount ; }
|
||||
|
||||
@ -772,8 +795,10 @@ void Cuda::fence()
|
||||
Kokkos::Impl::cuda_device_synchronize();
|
||||
}
|
||||
|
||||
const char* Cuda::name() { return "Cuda"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,19 +36,18 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_INTERNAL_HPP
|
||||
#define KOKKOS_CUDA_INTERNAL_HPP
|
||||
#include<iostream>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include<iostream>
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,8 +45,6 @@
|
||||
#define KOKKOS_CUDA_REDUCESCAN_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <utility>
|
||||
@ -63,6 +61,7 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Shuffle operations require input to be a register (stack) variable
|
||||
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
@ -140,77 +139,6 @@ void cuda_shfl_up( T & out , T const & in , int delta ,
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Reduce within a warp over blockDim.x, the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Requires:
|
||||
* blockDim.x is power of 2
|
||||
* blockDim.x <= 32 (one warp)
|
||||
*
|
||||
* Cannot use "butterfly" pattern because floating point
|
||||
* addition is non-associative. Therefore, must broadcast
|
||||
* the final result.
|
||||
*/
|
||||
template< class Reducer >
|
||||
__device__ inline
|
||||
void cuda_intra_warp_vector_reduce( Reducer const & reducer )
|
||||
{
|
||||
static_assert(
|
||||
std::is_reference< typename Reducer::reference_type >::value , "" );
|
||||
|
||||
if ( 1 < blockDim.x ) {
|
||||
|
||||
typename Reducer::value_type tmp ;
|
||||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
|
||||
|
||||
if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
|
||||
}
|
||||
|
||||
// Broadcast from root "lane" to all other "lanes"
|
||||
|
||||
cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Inclusive scan over blockDim.x, the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Algorithm is concurrent bottom-up reductions in triangular pattern
|
||||
* where each CUDA thread is the root of a reduction tree from the
|
||||
* zeroth CUDA thread to itself.
|
||||
*
|
||||
* Requires:
|
||||
* blockDim.x is power of 2
|
||||
* blockDim.x <= 32 (one warp)
|
||||
*/
|
||||
template< typename ValueType >
|
||||
__device__ inline
|
||||
void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
|
||||
{
|
||||
ValueType tmp ;
|
||||
|
||||
// Bottom up:
|
||||
// [t] += [t-1] if t >= 1
|
||||
// [t] += [t-2] if t >= 2
|
||||
// [t] += [t-4] if t >= 4
|
||||
// ...
|
||||
|
||||
for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
|
||||
|
||||
cuda_shfl_up( tmp , local , i , blockDim.x );
|
||||
|
||||
if ( i <= threadIdx.x ) { local += tmp ; }
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/*
|
||||
* Algorithmic constraints:
|
||||
@ -247,12 +175,12 @@ inline void cuda_inter_warp_reduction( ValueType& value,
|
||||
|
||||
#define STEP_WIDTH 4
|
||||
// Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries
|
||||
// The reason not to use ValueType directly is that for types with constructors it
|
||||
// The reason not to use ValueType directly is that for types with constructors it
|
||||
// could lead to race conditions
|
||||
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
|
||||
ValueType* result = (ValueType*) & sh_result;
|
||||
const unsigned step = 32 / blockDim.x;
|
||||
unsigned shift = STEP_WIDTH;
|
||||
const int step = 32 / blockDim.x;
|
||||
int shift = STEP_WIDTH;
|
||||
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
|
||||
if(id < STEP_WIDTH ) {
|
||||
result[id] = value;
|
||||
@ -297,7 +225,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
//Do the intra-block reduction with shfl operations and static shared memory
|
||||
cuda_intra_block_reduction(value,join,max_active_thread);
|
||||
|
||||
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
const int id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
|
||||
//One thread in the block writes block result to global scratch_memory
|
||||
if(id == 0 ) {
|
||||
@ -329,35 +257,35 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
|
||||
//Reduce all global values with splitting work over threads in one warp
|
||||
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
|
||||
for(int i=id; i<gridDim.x; i+=step_size) {
|
||||
for(int i=id; i<(int)gridDim.x; i+=step_size) {
|
||||
value_type tmp = global[i];
|
||||
join(value, tmp);
|
||||
}
|
||||
|
||||
//Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
|
||||
if (blockDim.x*blockDim.y > 1) {
|
||||
if (int(blockDim.x*blockDim.y) > 1) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 1,32);
|
||||
if( id + 1 < gridDim.x )
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 2) {
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < gridDim.x )
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 4) {
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < gridDim.x )
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 8) {
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < gridDim.x )
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 16) {
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < gridDim.x )
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
}
|
||||
@ -370,6 +298,166 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
#endif
|
||||
}
|
||||
|
||||
template< class ReducerType >
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_intra_warp_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
unsigned int shift = 1;
|
||||
|
||||
ValueType result = reducer.reference();
|
||||
//Reduce over values from threads with different threadIdx.y
|
||||
while(blockDim.x * shift < 32 ) {
|
||||
const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
|
||||
//Only join if upper thread is active (this allows non power of two for blockDim.y
|
||||
if(threadIdx.y + shift < max_active_thread)
|
||||
reducer.join(result , tmp);
|
||||
shift*=2;
|
||||
}
|
||||
|
||||
result = shfl(result,0,32);
|
||||
reducer.reference() = result;
|
||||
}
|
||||
|
||||
template< class ReducerType >
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_inter_warp_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
#define STEP_WIDTH 4
|
||||
// Depending on the ValueType _shared__ memory must be aligned up to 8byte boundaries
|
||||
// The reason not to use ValueType directly is that for types with constructors it
|
||||
// could lead to race conditions
|
||||
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
|
||||
ValueType* result = (ValueType*) & sh_result;
|
||||
ValueType value = reducer.reference();
|
||||
const int step = 32 / blockDim.x;
|
||||
int shift = STEP_WIDTH;
|
||||
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
|
||||
if(id < STEP_WIDTH ) {
|
||||
result[id] = value;
|
||||
}
|
||||
__syncthreads();
|
||||
while (shift<=max_active_thread/step) {
|
||||
if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
|
||||
reducer.join(result[id%STEP_WIDTH],value);
|
||||
}
|
||||
__syncthreads();
|
||||
shift+=STEP_WIDTH;
|
||||
}
|
||||
|
||||
|
||||
value = result[0];
|
||||
for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
|
||||
reducer.join(value,result[i]);
|
||||
|
||||
reducer.reference() = value;
|
||||
}
|
||||
|
||||
template< class ReducerType >
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_intra_block_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
cuda_intra_warp_reduction(reducer,max_active_thread);
|
||||
cuda_inter_warp_reduction(reducer,max_active_thread);
|
||||
}
|
||||
|
||||
template< class ReducerType>
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value , bool >::type
|
||||
cuda_inter_block_reduction( const ReducerType& reducer,
|
||||
Cuda::size_type * const m_scratch_space,
|
||||
Cuda::size_type * const m_scratch_flags,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
typedef typename ReducerType::value_type* pointer_type;
|
||||
typedef typename ReducerType::value_type value_type;
|
||||
|
||||
//Do the intra-block reduction with shfl operations and static shared memory
|
||||
cuda_intra_block_reduction(reducer,max_active_thread);
|
||||
|
||||
value_type value = reducer.reference();
|
||||
|
||||
const int id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
|
||||
//One thread in the block writes block result to global scratch_memory
|
||||
if(id == 0 ) {
|
||||
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
|
||||
*global = value;
|
||||
}
|
||||
|
||||
//One warp of last block performs inter block reduction through loading the block values from global scratch_memory
|
||||
bool last_block = false;
|
||||
|
||||
__syncthreads();
|
||||
if ( id < 32 ) {
|
||||
Cuda::size_type count;
|
||||
|
||||
//Figure out whether this is the last block
|
||||
if(id == 0)
|
||||
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
|
||||
count = Kokkos::shfl(count,0,32);
|
||||
|
||||
//Last block does the inter block reduction
|
||||
if( count == gridDim.x - 1) {
|
||||
//set flag back to zero
|
||||
if(id == 0)
|
||||
*m_scratch_flags = 0;
|
||||
last_block = true;
|
||||
reducer.init(value);
|
||||
|
||||
pointer_type const volatile global = (pointer_type) m_scratch_space ;
|
||||
|
||||
//Reduce all global values with splitting work over threads in one warp
|
||||
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
|
||||
for(int i=id; i<(int)gridDim.x; i+=step_size) {
|
||||
value_type tmp = global[i];
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
|
||||
//Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
|
||||
if (int(blockDim.x*blockDim.y) > 1) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 1,32);
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//The last block has in its thread=0 the global reduction value through "value"
|
||||
return last_block;
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// See section B.17 of Cuda C Programming Guide Version 3.2
|
||||
// for discussion of
|
||||
@ -529,11 +617,11 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
|
||||
size_type * const global = global_data + word_count.value * block_id ;
|
||||
|
||||
#if (__CUDA_ARCH__ < 500)
|
||||
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
|
||||
#else
|
||||
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
|
||||
#endif
|
||||
//#if (__CUDA_ARCH__ < 500)
|
||||
for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
|
||||
//#else
|
||||
// for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
|
||||
//#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,15 +36,16 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -120,7 +121,7 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
||||
}
|
||||
|
||||
if ( 0 == warp_lane ) {
|
||||
queue->complete( task.ptr );
|
||||
queue->complete( task.ptr );
|
||||
}
|
||||
}
|
||||
} while(1);
|
||||
@ -157,7 +158,7 @@ printf("cuda_task_queue_execute before\n");
|
||||
// If not large enough then set the stack size, in bytes:
|
||||
//
|
||||
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
|
||||
|
||||
|
||||
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
@ -173,7 +174,7 @@ printf("cuda_task_queue_execute after\n");
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_CUDA_KOKKOS_CUDA_TASK_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
|
||||
|
||||
|
||||
@ -44,6 +44,7 @@
|
||||
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
#define KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -325,7 +326,7 @@ ValueType shfl_warp_broadcast
|
||||
return Kokkos::shfl(val, src_lane, width);
|
||||
}
|
||||
|
||||
// all-reduce across corresponding vector lanes between team members within warp
|
||||
/*// all-reduce across corresponding vector lanes between team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
@ -351,7 +352,7 @@ void parallel_reduce
|
||||
loop_boundaries.thread.team_size(),
|
||||
blockDim.x);
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
|
||||
}
|
||||
}*/
|
||||
|
||||
// all-reduce across corresponding vector lanes between team members within warp
|
||||
// if no join() provided, use sum
|
||||
@ -382,13 +383,36 @@ void parallel_reduce
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const ReducerType& reducer) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
//TODO what is the point of creating this temporary?
|
||||
ValueType result = ValueType();
|
||||
reducer.init(result);
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
strided_shfl_warp_reduction(
|
||||
[&] (ValueType& val1, const ValueType& val2) { reducer.join(val1,val2); },
|
||||
result,
|
||||
loop_boundaries.thread.team_size(),
|
||||
blockDim.x);
|
||||
reducer.reference() = shfl_warp_broadcast<ValueType>( result, threadIdx.x, Impl::CudaTraits::WarpSize );
|
||||
}
|
||||
// all-reduce within team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
// threadIdx.y == member number
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
/*template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
@ -404,7 +428,7 @@ void parallel_reduce
|
||||
|
||||
multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
|
||||
}
|
||||
}*/
|
||||
|
||||
// all-reduce within team members within warp
|
||||
// if no join() provided, use sum
|
||||
@ -436,6 +460,28 @@ void parallel_reduce
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const ReducerType& reducer) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
ValueType result = ValueType();
|
||||
reducer.init(result);
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
multi_shfl_warp_reduction(
|
||||
[&] (ValueType& val1, const ValueType& val2) { reducer.join(val1, val2); },
|
||||
result,
|
||||
blockDim.x);
|
||||
reducer.reference() = shfl_warp_broadcast<ValueType>( result, 0, blockDim.x );
|
||||
}
|
||||
// scan across corresponding vector lanes between team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
|
||||
982
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
Normal file
982
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
Normal file
@ -0,0 +1,982 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_TEAM_HPP
|
||||
#define KOKKOS_CUDA_TEAM_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <utility>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< typename Type >
|
||||
struct CudaJoinFunctor {
|
||||
typedef Type value_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void join( volatile value_type & update ,
|
||||
volatile const value_type & input )
|
||||
{ update += input ; }
|
||||
};
|
||||
|
||||
/**\brief Team member_type passed to TeamPolicy or TeamTask closures.
|
||||
*
|
||||
* Cuda thread blocks for team closures are dimensioned as:
|
||||
* blockDim.x == number of "vector lanes" per "thread"
|
||||
* blockDim.y == number of "threads" per team
|
||||
* blockDim.z == number of teams in a block
|
||||
* where
|
||||
* A set of teams exactly fill a warp OR a team is the whole block
|
||||
* ( 0 == WarpSize % ( blockDim.x * blockDim.y ) )
|
||||
* OR
|
||||
* ( 1 == blockDim.z )
|
||||
*
|
||||
* Thus when 1 < blockDim.z the team is warp-synchronous
|
||||
* and __syncthreads should not be called in team collectives.
|
||||
*
|
||||
* When multiple teams are mapped onto a single block then the
|
||||
* total available shared memory must be partitioned among teams.
|
||||
*/
|
||||
class CudaTeamMember {
|
||||
private:
|
||||
|
||||
typedef Kokkos::Cuda execution_space ;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space ;
|
||||
|
||||
void * m_team_reduce ;
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_reduce_size ;
|
||||
int m_league_rank ;
|
||||
int m_league_size ;
|
||||
|
||||
public:
|
||||
|
||||
/** \brief Scratch memory handle for the whole team.
 *
 *  Returns m_team_shared after set_team_thread_mode(0,1,0).
 *  NOTE(review): the three arguments look like (level, number of
 *  partitions, partition index) — confirm against the declaration of
 *  scratch_memory_space::set_team_thread_mode.
 */
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{
  return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
}
|
||||
|
||||
/** \brief Team-wide scratch memory handle for the requested \c level.
 *
 *  Returns m_team_shared after set_team_thread_mode(level,1,0).
 */
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space &
team_scratch( const int & level ) const
{
  return m_team_shared.set_team_thread_mode( level , 1 , 0 );
}
|
||||
|
||||
/** \brief Scratch memory handle for the calling team thread at \c level.
 *
 *  Returns m_team_shared after
 *  set_team_thread_mode(level, team_size(), team_rank()).
 */
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space &
thread_scratch( const int & level ) const
{
  return m_team_shared.set_team_thread_mode( level , team_size() , team_rank() );
}
|
||||
|
||||
/** \brief League rank given to the constructor (arg_league_rank). */
KOKKOS_INLINE_FUNCTION
int league_rank() const
{
  return m_league_rank ;
}
|
||||
/** \brief League size given to the constructor (arg_league_size). */
KOKKOS_INLINE_FUNCTION
int league_size() const
{
  return m_league_size ;
}
|
||||
/** \brief Rank of the calling thread within its team.
 *
 *  Device code: threadIdx.y.  Host compilation pass: always 0.
 */
KOKKOS_INLINE_FUNCTION
int team_rank() const
{
#if defined( __CUDA_ARCH__ )
  return threadIdx.y ;
#else
  return 0 ;
#endif
}
|
||||
|
||||
/** \brief Number of threads in the team.
 *
 *  Device code: blockDim.y.  Host compilation pass: always 1.
 */
KOKKOS_INLINE_FUNCTION
int team_size() const
{
#if defined( __CUDA_ARCH__ )
  return blockDim.y ;
#else
  return 1 ;
#endif
}
|
||||
|
||||
/** \brief Barrier across the team.
 *
 *  When blockDim.z == 1 the team is the whole block and __syncthreads()
 *  is used; otherwise (several teams per block, team <= warp) only a
 *  block-level memory fence is issued.  No-op in the host pass.
 */
KOKKOS_INLINE_FUNCTION
void team_barrier() const
{
#if defined( __CUDA_ARCH__ )
  if ( 1 != blockDim.z ) {
    __threadfence_block();  // team <= warp
  }
  else {
    __syncthreads();        // team == block
  }
#endif
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
template<class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast( ValueType & val, const int& thread_id) const
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
if ( 1 == blockDim.z ) { // team == block
|
||||
__syncthreads();
|
||||
// Wait for shared data write until all threads arrive here
|
||||
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
|
||||
*((ValueType*) m_team_reduce) = val ;
|
||||
}
|
||||
__syncthreads(); // Wait for shared data read until root thread writes
|
||||
val = *((ValueType*) m_team_reduce);
|
||||
}
|
||||
else { // team <= warp
|
||||
ValueType tmp( val ); // input might not be a register variable
|
||||
cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
/**\brief Reduction across a team
|
||||
*
|
||||
* Mapping of teams onto blocks:
|
||||
* blockDim.x is "vector lanes"
|
||||
* blockDim.y is team "threads"
|
||||
* blockDim.z is number of teams per block
|
||||
*
|
||||
* Requires:
|
||||
* blockDim.x is power two
|
||||
* blockDim.x <= CudaTraits::WarpSize
|
||||
* ( 0 == CudaTraits::WarpSize % ( blockDim.x * blockDim.y )
|
||||
* OR
|
||||
* ( 1 == blockDim.z )
|
||||
*/
|
||||
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< is_reducer< ReducerType >::value >::type
|
||||
team_reduce( ReducerType const & reducer ) const noexcept
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
typedef typename ReducerType::value_type value_type ;
|
||||
|
||||
value_type tmp( reducer.reference() );
|
||||
|
||||
// reduce within the warp using shuffle
|
||||
|
||||
const int wx =
|
||||
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
|
||||
|
||||
// Root of each vector lane reduces:
|
||||
if ( 0 == threadIdx.x && wx < i ) {
|
||||
reducer.join( tmp , reducer.reference() );
|
||||
}
|
||||
}
|
||||
|
||||
if ( 1 < blockDim.z ) { // team <= warp
|
||||
// broadcast result from root vector lange of root thread
|
||||
|
||||
cuda_shfl( reducer.reference() , tmp
|
||||
, blockDim.x * threadIdx.y , CudaTraits::WarpSize );
|
||||
|
||||
}
|
||||
else { // team == block
|
||||
// Reduce across warps using shared memory
|
||||
// Broadcast result within block
|
||||
|
||||
// Number of warps, blockDim.y may not be power of two:
|
||||
const int nw = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
// Warp index:
|
||||
const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
// Number of shared memory entries for the reduction:
|
||||
int nsh = m_team_reduce_size / sizeof(value_type);
|
||||
|
||||
// Using at most one entry per warp:
|
||||
if ( nw < nsh ) nsh = nw ;
|
||||
|
||||
__syncthreads(); // Wait before shared data write
|
||||
|
||||
if ( 0 == wx && wy < nsh ) {
|
||||
((value_type*) m_team_reduce)[wy] = tmp ;
|
||||
}
|
||||
|
||||
// When more warps than shared entries:
|
||||
for ( int i = nsh ; i < nw ; i += nsh ) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if ( 0 == wx && i <= wy ) {
|
||||
const int k = wy - i ;
|
||||
if ( k < nsh ) {
|
||||
reducer.join( *((value_type*) m_team_reduce + k) , tmp );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// One warp performs the inter-warp reduction:
|
||||
|
||||
if ( 0 == wy ) {
|
||||
|
||||
// Start at power of two covering nsh
|
||||
|
||||
for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
|
||||
const int k = wx + i ;
|
||||
if ( wx < i && k < nsh ) {
|
||||
reducer.join( ((value_type*)m_team_reduce)[wx]
|
||||
, ((value_type*)m_team_reduce)[k] );
|
||||
__threadfence_block();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for reduction
|
||||
|
||||
// Broadcast result to all threads
|
||||
reducer.reference() = *((value_type*)m_team_reduce);
|
||||
}
|
||||
|
||||
#endif /* #ifdef __CUDA_ARCH__ */
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
* with intra-team non-deterministic ordering accumulation.
|
||||
*
|
||||
* The global inter-team accumulation value will, at the end of the
|
||||
* league's parallel execution, be the scan's total.
|
||||
* Parallel execution ordering of the league's teams is non-deterministic.
|
||||
* As such the base value for each team's scan operation is similarly
|
||||
* non-deterministic.
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Type team_scan( const Type & value , Type * const global_accum ) const
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
Type * const base_data = (Type *) m_team_reduce ;
|
||||
|
||||
__syncthreads(); // Don't write in to shared data until all threads have entered this function
|
||||
|
||||
if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
|
||||
|
||||
base_data[ threadIdx.y + 1 ] = value ;
|
||||
|
||||
Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
|
||||
|
||||
if ( global_accum ) {
|
||||
if ( blockDim.y == threadIdx.y + 1 ) {
|
||||
base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
|
||||
}
|
||||
__syncthreads(); // Wait for atomic
|
||||
base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
|
||||
}
|
||||
|
||||
return base_data[ threadIdx.y ];
|
||||
#else
|
||||
return Type();
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
*
|
||||
* The highest rank thread can compute the reduction total as
|
||||
* reduction_total = dev.team_scan( value ) + value ;
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const {
|
||||
return this->template team_scan<Type>( value , 0 );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
typename std::enable_if< is_reducer< ReducerType >::value >::type
|
||||
vector_reduce( ReducerType const & reducer )
|
||||
{
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(blockDim.x == 1) return;
|
||||
|
||||
// Intra vector lane shuffle reduction:
|
||||
typename ReducerType::value_type tmp ( reducer.reference() );
|
||||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
|
||||
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
|
||||
}
|
||||
|
||||
// Broadcast from root lane to all other lanes.
|
||||
// Cannot use "butterfly" algorithm to avoid the broadcast
|
||||
// because floating point summation is not associative
|
||||
// and thus different threads could have different results.
|
||||
|
||||
cuda_shfl( reducer.reference() , tmp , 0 , blockDim.x );
|
||||
#endif
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
/**\brief Global reduction across all blocks
|
||||
*
|
||||
* Return !0 if reducer contains the final value
|
||||
*/
|
||||
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
typename std::enable_if< is_reducer< ReducerType >::value , int >::type
|
||||
global_reduce( ReducerType const & reducer
|
||||
, int * const global_scratch_flags
|
||||
, void * const global_scratch_space
|
||||
, void * const shmem
|
||||
, int const shmem_size
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
typedef typename ReducerType::value_type value_type ;
|
||||
typedef value_type volatile * pointer_type ;
|
||||
|
||||
// Number of shared memory entries for the reduction:
|
||||
const int nsh = shmem_size / sizeof(value_type);
|
||||
|
||||
// Number of CUDA threads in the block, rank within the block
|
||||
const int nid = blockDim.x * blockDim.y * blockDim.z ;
|
||||
const int tid = threadIdx.x + blockDim.x * (
|
||||
threadIdx.y + blockDim.y * threadIdx.z );
|
||||
|
||||
// Reduces within block using all available shared memory
|
||||
// Contributes if it is the root "vector lane"
|
||||
|
||||
// wn == number of warps in the block
|
||||
// wx == which lane within the warp
|
||||
// wy == which warp within the block
|
||||
|
||||
const int wn = ( nid + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
|
||||
const int wx = tid & CudaTraits::WarpIndexMask ;
|
||||
const int wy = tid >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
//------------------------
|
||||
{ // Intra warp shuffle reduction from contributing CUDA threads
|
||||
|
||||
value_type tmp( reducer.reference() );
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
|
||||
|
||||
// Root of each vector lane reduces "thread" contribution
|
||||
if ( 0 == threadIdx.x && wx < i ) {
|
||||
reducer.join( & tmp , reducer.data() );
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce across warps using shared memory.
|
||||
// Number of warps may not be power of two.
|
||||
|
||||
__syncthreads(); // Wait before shared data write
|
||||
|
||||
// Number of shared memory entries for the reduction
|
||||
// is at most one per warp
|
||||
const int nentry = wn < nsh ? wn : nsh ;
|
||||
|
||||
if ( 0 == wx && wy < nentry ) {
|
||||
// Root thread of warp 'wy' has warp's value to contribute
|
||||
((value_type*) shmem)[wy] = tmp ;
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for write to be visible to block
|
||||
|
||||
// When more warps than shared entries
|
||||
// then warps must take turns joining their contribution
|
||||
// to the designated shared memory entry.
|
||||
for ( int i = nentry ; i < wn ; i += nentry ) {
|
||||
|
||||
const int k = wy - i ;
|
||||
|
||||
if ( 0 == wx && i <= wy && k < nentry ) {
|
||||
// Root thread of warp 'wy' has warp's value to contribute
|
||||
reducer.join( ((value_type*) shmem) + k , & tmp );
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for write to be visible to block
|
||||
}
|
||||
|
||||
// One warp performs the inter-warp reduction:
|
||||
|
||||
if ( 0 == wy ) {
|
||||
|
||||
// Start fan-in at power of two covering nentry
|
||||
|
||||
for ( int i = ( 1 << ( 32 - __clz(nentry-1) ) ) ; ( i >>= 1 ) ; ) {
|
||||
const int k = wx + i ;
|
||||
if ( wx < i && k < nentry ) {
|
||||
reducer.join( ((pointer_type)shmem) + wx
|
||||
, ((pointer_type)shmem) + k );
|
||||
__threadfence_block(); // Wait for write to be visible to warp
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//------------------------
|
||||
{ // Write block's value to global_scratch_memory
|
||||
|
||||
int last_block = 0 ;
|
||||
|
||||
if ( 0 == wx ) {
|
||||
reducer.copy( ((pointer_type)global_scratch_space)
|
||||
+ blockIdx.x * reducer.length()
|
||||
, reducer.data() );
|
||||
|
||||
__threadfence(); // Wait until global write is visible.
|
||||
|
||||
last_block = gridDim.x ==
|
||||
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
|
||||
|
||||
// If last block then reset count
|
||||
if ( last_block ) *global_scratch_flags = 0 ;
|
||||
}
|
||||
|
||||
last_block = __syncthreads_or( last_block );
|
||||
|
||||
if ( ! last_block ) return 0 ;
|
||||
|
||||
}
|
||||
//------------------------
|
||||
// Last block reads global_scratch_memory into shared memory.
|
||||
|
||||
const int nentry = nid < gridDim.x ?
|
||||
( nid < nsh ? nid : nsh ) :
|
||||
( gridDim.x < nsh ? gridDim.x : nsh ) ;
|
||||
|
||||
// nentry = min( nid , nsh , gridDim.x )
|
||||
|
||||
// whole block reads global memory into shared memory:
|
||||
|
||||
if ( tid < nentry ) {
|
||||
|
||||
const int offset = tid * reducer.length();
|
||||
|
||||
reducer.copy( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space) + offset );
|
||||
|
||||
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
|
||||
reducer.join( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space)
|
||||
+ i * reducer.length() );
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for writes to be visible to block
|
||||
|
||||
if ( 0 == wy ) {
|
||||
|
||||
// Iterate to reduce shared memory to single warp fan-in size
|
||||
|
||||
const int nreduce = CudaTraits::WarpSize < nentry
|
||||
? CudaTraits::WarpSize : nentry ;
|
||||
|
||||
// nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )
|
||||
|
||||
if ( wx < nreduce && nreduce < nentry ) {
|
||||
for ( int i = nreduce + wx ; i < nentry ; i += nreduce ) {
|
||||
reducer.join( ((pointer_type)shmem) + wx
|
||||
, ((pointer_type)shmem) + i );
|
||||
}
|
||||
__threadfence_block(); // Wait for writes to be visible to warp
|
||||
}
|
||||
|
||||
// Start fan-in at power of two covering nentry
|
||||
|
||||
for ( int i = ( 1 << ( 32 - __clz(nreduce-1) ) ) ; ( i >>= 1 ) ; ) {
|
||||
const int k = wx + i ;
|
||||
if ( wx < i && k < nreduce ) {
|
||||
reducer.join( ((pointer_type)shmem) + wx
|
||||
, ((pointer_type)shmem) + k );
|
||||
__threadfence_block(); // Wait for writes to be visible to warp
|
||||
}
|
||||
}
|
||||
|
||||
if ( 0 == wx ) {
|
||||
reducer.copy( reducer.data() , (pointer_type)shmem );
|
||||
return 1 ;
|
||||
}
|
||||
}
|
||||
return 0 ;
|
||||
|
||||
#else
|
||||
return 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver
|
||||
|
||||
/** \brief Construct a team member handle (intended for the driver).
 *
 *  \param shared                start of the team's shared buffer; the
 *                               first \c shared_begin bytes serve as the
 *                               team reduction buffer
 *  \param shared_begin          byte offset at which the team scratch
 *                               space starts (also stored as the size of
 *                               the reduction buffer)
 *  \param shared_size           size passed to the scratch memory space
 *  \param scratch_level_1_ptr   forwarded to the scratch memory space
 *  \param scratch_level_1_size  forwarded to the scratch memory space
 *  \param arg_league_rank       value reported by league_rank()
 *  \param arg_league_size       value reported by league_size()
 */
KOKKOS_INLINE_FUNCTION
CudaTeamMember( void *    shared
              , const int shared_begin
              , const int shared_size
              , void *    scratch_level_1_ptr
              , const int scratch_level_1_size
              , const int arg_league_rank
              , const int arg_league_size
              )
  : m_team_reduce( shared )
  , m_team_shared( ((char *)shared) + shared_begin
                 , shared_size
                 , scratch_level_1_ptr
                 , scratch_level_1_size )
  , m_team_reduce_size( shared_begin )
  , m_league_rank( arg_league_rank )
  , m_league_size( arg_league_size )
{
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
|
||||
typedef iType index_type;
|
||||
const CudaTeamMember& member;
|
||||
const iType start;
|
||||
const iType end;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count)
|
||||
: member(thread_)
|
||||
, start( 0 )
|
||||
, end( count ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_)
|
||||
: member(thread_)
|
||||
, start( begin_ )
|
||||
, end( end_ ) {}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template<typename iType>
|
||||
struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
|
||||
typedef iType index_type;
|
||||
const iType start;
|
||||
const iType end;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count)
|
||||
: start( 0 ), end( count ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const iType& count)
|
||||
: start( 0 ), end( count ) {}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
|
||||
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
|
||||
}
|
||||
|
||||
template< typename iType1, typename iType2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::CudaTeamMember >
|
||||
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
|
||||
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief Wrap a Cuda team member in a ThreadSingleStruct tag. */
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::CudaTeamMember>
PerTeam( const Impl::CudaTeamMember & thread )
{
  using single_type = Impl::ThreadSingleStruct< Impl::CudaTeamMember > ;
  return single_type( thread );
}
|
||||
|
||||
/** \brief Wrap a Cuda team member in a VectorSingleStruct tag. */
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::CudaTeamMember>
PerThread( const Impl::CudaTeamMember & thread )
{
  using single_type = Impl::VectorSingleStruct< Impl::CudaTeamMember > ;
  return single_type( thread );
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Inter-thread parallel_for.
|
||||
*
|
||||
* Executes closure(iType i) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all threads of the the calling thread team.
|
||||
*/
|
||||
template<typename iType, class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
for( iType i = loop_boundaries.start + threadIdx.y
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.y )
|
||||
closure(i);
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Inter-thread parallel_reduce with a reducer.
|
||||
*
|
||||
* Executes closure(iType i, ValueType & val) for each i=[0..N)
|
||||
*
|
||||
* The range [0..N) is mapped to all threads of the
|
||||
* calling thread team and a summation of val is
|
||||
* performed and put into result.
|
||||
*/
|
||||
template< typename iType, class Closure, class ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
, const ReducerType & reducer
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
reducer.init( reducer.reference() );
|
||||
|
||||
for( iType i = loop_boundaries.start + threadIdx.y
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.y ) {
|
||||
closure(i,reducer.reference());
|
||||
}
|
||||
|
||||
loop_boundaries.member.team_reduce( reducer );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/** \brief Inter-thread parallel_reduce assuming summation.
|
||||
*
|
||||
* Executes closure(iType i, ValueType & val) for each i=[0..N)
|
||||
*
|
||||
* The range [0..N) is mapped to all threads of the
|
||||
* calling thread team and a summation of val is
|
||||
* performed and put into result.
|
||||
*/
|
||||
template< typename iType, class Closure, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
|
||||
parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
, ValueType & result
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
Kokkos::Experimental::Sum<ValueType> reducer(result);
|
||||
|
||||
reducer.init( reducer.reference() );
|
||||
|
||||
for( iType i = loop_boundaries.start + threadIdx.y
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.y ) {
|
||||
closure(i,result);
|
||||
}
|
||||
|
||||
loop_boundaries.member.team_reduce( reducer );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Intra-thread vector parallel_for.
|
||||
*
|
||||
* Executes closure(iType i) for each i=[0..N)
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of the the calling thread.
|
||||
*/
|
||||
template<typename iType, class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
for ( iType i = loop_boundaries.start + threadIdx.x
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.x ) {
|
||||
closure(i);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce.
|
||||
*
|
||||
* Calls closure(iType i, ValueType & val) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of
|
||||
* the calling thread and a reduction of val is performed using +=
|
||||
* and output into result.
|
||||
*
|
||||
* The identity value for the += operator is assumed to be the default
|
||||
* constructed value.
|
||||
*/
|
||||
template< typename iType, class Closure, class ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce
|
||||
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
|
||||
const & loop_boundaries
|
||||
, Closure const & closure
|
||||
, ReducerType const & reducer )
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
reducer.init( reducer.reference() );
|
||||
|
||||
for ( iType i = loop_boundaries.start + threadIdx.x
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.x ) {
|
||||
closure(i,reducer.reference());
|
||||
}
|
||||
|
||||
Impl::CudaTeamMember::vector_reduce( reducer );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce.
|
||||
*
|
||||
* Calls closure(iType i, ValueType & val) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of
|
||||
* the calling thread and a reduction of val is performed using +=
|
||||
* and output into result.
|
||||
*
|
||||
* The identity value for the += operator is assumed to be the default
|
||||
* constructed value.
|
||||
*/
|
||||
template< typename iType, class Closure, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< ! is_reducer< ValueType >::value >::type
|
||||
parallel_reduce
|
||||
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
|
||||
const & loop_boundaries
|
||||
, Closure const & closure
|
||||
, ValueType & result )
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
result = ValueType();
|
||||
|
||||
for ( iType i = loop_boundaries.start + threadIdx.x
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.x ) {
|
||||
closure(i,result);
|
||||
}
|
||||
|
||||
Impl::CudaTeamMember::vector_reduce(
|
||||
Kokkos::Experimental::Sum<ValueType>(result ) );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief  Intra-thread vector parallel exclusive prefix sum.
 *
 *  Calls closure(int i, value_type & val, bool final) twice for every
 *  index below loop_boundaries.end:
 *    - first with final == false, so the closure can add its contribution
 *      for index i into val (val starts at zero);
 *    - then with final == true, with val holding the exclusive prefix sum
 *      of all contributions preceding index i.
 *
 *  The indices are distributed over the vector lanes (threadIdx.x) and
 *  processed in chunks of blockDim.x; a scan is performed per chunk.
 */
template< typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_scan
  ( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
      loop_boundaries
  , const Closure & closure
  )
{
#ifdef __CUDA_ARCH__

  // The scanned value type is deduced from the closure's signature.
  using value_type =
    typename Kokkos::Impl::FunctorAnalysis
      < Kokkos::Impl::FunctorPatternInterface::SCAN
      , void
      , Closure >::value_type ;

  // Sum of all contributions from the chunks processed so far.
  value_type running_total = 0 ;

  // Every lane has to execute the same number of iterations because the
  // shuffles below are collective. Round the end of the range up to the
  // next multiple of blockDim.x.
  // Requires:
  //   blockDim.x is a power of two, so that
  //     ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
  //   1 <= blockDim.x <= CudaTraits::WarpSize

  const int lane_mask  = blockDim.x - 1 ;
  const int tail       = loop_boundaries.end & lane_mask ; // == end % blockDim.x
  const int padded_end = loop_boundaries.end + ( tail ? blockDim.x - tail : 0 );

  for ( int idx = threadIdx.x ; idx < padded_end ; idx += blockDim.x ) {

    // Step 1: gather this lane's contribution (zero for padding lanes).
    value_type contrib = 0 ;
    if ( idx < loop_boundaries.end ) closure( idx , contrib , false );

    // Step 2: inclusive scan across the lanes of this chunk. At every
    // doubling of the offset a lane adds the value held 'offset' lanes
    // below it:
    //   [t] += [t-1] if t >= 1
    //   [t] += [t-2] if t >= 2
    //   [t] += [t-4] if t >= 4
    //   ...
    value_type inclusive = contrib ;
    for ( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
      value_type shifted = 0 ;
      Impl::cuda_shfl_up( shifted , inclusive , offset , blockDim.x );
      if ( offset <= threadIdx.x ) { inclusive += shifted ; }
    }

    // Step 3: turn the inclusive value into an exclusive one by removing
    // this lane's own contribution, and add the carry from earlier chunks.
    contrib = running_total + inclusive - contrib ;

    // Step 4: report the exclusive prefix to the closure.
    if ( idx < loop_boundaries.end ) closure( idx , contrib , true );

    // Step 5: fetch the inclusive value of the last lane (index lane_mask),
    // i.e. the total of this chunk, and carry it into the next chunk.
    Impl::cuda_shfl( inclusive , inclusive , lane_mask , blockDim.x );
    running_total += inclusive ;
  }

#endif
}
|
||||
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
// Vector-level 'single' without a value: on the device, only the thread
// with threadIdx.x == 0 invokes the functor. No-op in host compilation.
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if ( 0 == threadIdx.x ) {
    lambda();
  }
#endif
}
|
||||
|
||||
// Thread-level 'single' without a value: on the device, only the thread
// with threadIdx.x == 0 and threadIdx.y == 0 invokes the functor.
// No-op in host compilation.
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
  if ( 0 == threadIdx.x && 0 == threadIdx.y ) {
    lambda();
  }
#endif
}
|
||||
|
||||
// Vector-level 'single' with a value: the thread with threadIdx.x == 0
// runs lambda(val); afterwards every thread replaces its val with the
// result of shfl(val, 0, blockDim.x).
// NOTE(review): shfl presumably broadcasts lane 0's value across the
// blockDim.x-wide vector - confirm against its definition.
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if ( 0 == threadIdx.x ) {
    lambda(val);
  }
  // Executed by all threads, not only the one that ran the functor.
  val = shfl(val,0,blockDim.x);
#endif
}
|
||||
|
||||
// Thread-level 'single' with a value: the thread with threadIdx.x == 0
// and threadIdx.y == 0 runs lambda(val); afterwards every thread calls
// team_broadcast(val, 0) on the team member carried by single_struct.
// NOTE(review): team_broadcast presumably distributes thread 0's value to
// the whole team - confirm against CudaTeamMember.
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
#ifdef __CUDA_ARCH__
  if ( 0 == threadIdx.x && 0 == threadIdx.y ) lambda(val);

  // Executed by all threads, not only the one that ran the functor.
  single_struct.team_member.team_broadcast(val,0);
#endif
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
|
||||
#endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,8 +44,6 @@
|
||||
#define KOKKOS_CUDA_VECTORIZATION_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
@ -296,3 +294,4 @@ namespace Impl {
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,7 +44,7 @@
|
||||
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -46,7 +46,7 @@
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
#include "Kokkos_Macros.hpp"
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <cuda.h>
|
||||
@ -82,6 +82,8 @@ void cuda_abort( const char * const message )
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
#else
|
||||
// Placeholder emitted when the CUDA branch above is compiled out.
// Declared inline because this definition lives in a header: without it,
// every translation unit that includes the header would emit its own
// external definition and the link would fail with duplicate symbols.
inline void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined(__CUDACC__) && defined( KOKKOS_ENABLE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
|
||||
|
||||
|
||||
@ -236,13 +236,13 @@ struct MDRangePolicy
|
||||
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
|
||||
|
||||
#else
|
||||
if(m_lower.size()!=rank || m_upper.size() != rank)
|
||||
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
|
||||
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
|
||||
|
||||
for ( auto i = 0; i < rank; ++i ) {
|
||||
m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
|
||||
m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
|
||||
if(tile.size()==rank)
|
||||
if(static_cast<int>(tile.size())==rank)
|
||||
m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
|
||||
else
|
||||
m_tile[i] = 0;
|
||||
|
||||
@ -44,6 +44,7 @@
|
||||
#define KOKKOS_COMPLEX_HPP
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Kokkos_NumericTraits.hpp>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
@ -324,9 +325,30 @@ public:
|
||||
im_ /= src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! Equality with another complex number: both parts must compare equal.
//! Const-qualified because it only reads re_ and im_, so it can also be
//! called on a const left-hand operand.
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& src) const {
  return (re_ == src.re_) && (im_ == src.im_);
}
|
||||
|
||||
//! Equality with a real number: the real part must equal src and the
//! imaginary part must be zero.
//! Const-qualified because it only reads re_ and im_, so it can also be
//! called on a const left-hand operand.
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType src) const {
  return (re_ == src) && (im_ == RealType(0));
}
|
||||
|
||||
//! Inequality with another complex number: true when either part differs.
//! Const-qualified because it only reads re_ and im_, so it can also be
//! called on a const left-hand operand.
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& src) const {
  return (re_ != src.re_) || (im_ != src.im_);
}
|
||||
|
||||
//! Inequality with a real number: true when the real part differs from
//! src or the imaginary part is non-zero.
//! Const-qualified because it only reads re_ and im_, so it can also be
//! called on a const left-hand operand.
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType src) const {
  return (re_ != src) || (im_ != RealType(0));
}
|
||||
|
||||
};
|
||||
|
||||
//! Binary + operator for complex.
|
||||
//! Binary + operator for complex complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
@ -334,6 +356,22 @@ operator + (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
|
||||
}
|
||||
|
||||
//! Binary + operator for complex scalar.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const complex<RealType>& x, const RealType& y) {
|
||||
return complex<RealType> (x.real () + y , x.imag ());
|
||||
}
|
||||
|
||||
//! Binary + operator for scalar complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x + y.real (), y.imag ());
|
||||
}
|
||||
|
||||
//! Unary + operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -350,6 +388,22 @@ operator - (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
|
||||
}
|
||||
|
||||
//! Binary - operator for complex scalar.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const complex<RealType>& x, const RealType& y) {
|
||||
return complex<RealType> (x.real () - y , x.imag ());
|
||||
}
|
||||
|
||||
//! Binary - operator for scalar complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x - y.real (), - y.imag ());
|
||||
}
|
||||
|
||||
//! Unary - operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -395,6 +449,16 @@ operator * (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x * y.real (), x * y.imag ());
|
||||
}
|
||||
|
||||
/// \brief Binary * operator for RealType times complex.
|
||||
///
|
||||
/// This function exists because the compiler doesn't know that
|
||||
/// RealType and complex<RealType> commute with respect to operator*.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator * (const complex<RealType>& y, const RealType& x) {
|
||||
return complex<RealType> (x * y.real (), x * y.imag ());
|
||||
}
|
||||
|
||||
//! Imaginary part of a complex number.
|
||||
template<class RealType>
|
||||
@ -415,7 +479,25 @@ template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
RealType abs (const complex<RealType>& x) {
|
||||
// FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
|
||||
return ::sqrt (real (x) * real (x) + imag (x) * imag (x));
|
||||
return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
|
||||
}
|
||||
|
||||
//! Power of a complex number
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Kokkos::complex<RealType> pow (const complex<RealType>& x, const RealType& e) {
|
||||
RealType r = abs(x);
|
||||
RealType phi = std::atan(x.imag()/x.real());
|
||||
return std::pow(r,e) * Kokkos::complex<RealType>(std::cos(phi*e),std::sin(phi*e));
|
||||
}
|
||||
|
||||
//! Square root of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Kokkos::complex<RealType> sqrt (const complex<RealType>& x) {
|
||||
RealType r = abs(x);
|
||||
RealType phi = std::atan(x.imag()/x.real());
|
||||
return std::sqrt(r) * Kokkos::complex<RealType>(std::cos(phi*0.5),std::sin(phi*0.5));
|
||||
}
|
||||
|
||||
//! Conjugate of a complex number.
|
||||
@ -425,6 +507,19 @@ complex<RealType> conj (const complex<RealType>& x) {
|
||||
return complex<RealType> (real (x), -imag (x));
|
||||
}
|
||||
|
||||
//! Exponential of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType> exp (const complex<RealType>& x) {
|
||||
return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()), std::sin(x.imag()));
|
||||
}
|
||||
|
||||
//! Exponential of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType> pow (const complex<RealType>& x) {
|
||||
return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()), std::sin(x.imag()));
|
||||
}
|
||||
|
||||
//! Binary operator / for complex and real numbers
|
||||
template<class RealType1, class RealType2>
|
||||
@ -461,6 +556,14 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
}
|
||||
}
|
||||
|
||||
//! Binary operator / for complex and real numbers
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType1>
|
||||
operator / (const RealType1& x, const complex<RealType2>& y) {
|
||||
return complex<RealType1> (x)/y;
|
||||
}
|
||||
|
||||
//! Equality operator for two complex numbers.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -468,9 +571,13 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return real (x) == real (y) && imag (x) == imag (y);
|
||||
}
|
||||
|
||||
//! Equality operator for std::complex and Kokkos::complex.
|
||||
/// \brief Equality operator for std::complex and Kokkos::complex.
|
||||
///
|
||||
/// This cannot be a device function, since std::real is not.
|
||||
/// Otherwise, CUDA builds will give compiler warnings ("warning:
|
||||
/// calling a constexpr __host__ function("real") from a __host__
|
||||
/// __device__ function("operator==") is not allowed").
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return std::real (x) == real (y) && std::imag (x) == imag (y);
|
||||
}
|
||||
@ -533,6 +640,15 @@ std::ostream& operator >> (std::ostream& os, complex<RealType>& x) {
|
||||
}
|
||||
|
||||
|
||||
template<class T>
|
||||
struct reduction_identity<Kokkos::complex<T> > {
|
||||
typedef reduction_identity<T> t_red_ident;
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> sum()
|
||||
{return Kokkos::complex<T>(t_red_ident::sum(),t_red_ident::sum());}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> prod()
|
||||
{return Kokkos::complex<T>(t_red_ident::prod(),t_red_ident::sum());}
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_COMPLEX_HPP
|
||||
|
||||
@ -200,7 +200,14 @@ public:
|
||||
, Kokkos::DefaultHostExecutionSpace , execution_space
|
||||
>::type host_execution_space ;
|
||||
#else
|
||||
typedef execution_space host_execution_space ;
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
typedef typename std::conditional
|
||||
< std::is_same< execution_space , Kokkos::Experimental::OpenMPTarget >::value
|
||||
, Kokkos::DefaultHostExecutionSpace , execution_space
|
||||
>::type host_execution_space ;
|
||||
#else
|
||||
typedef execution_space host_execution_space ;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
typedef typename std::conditional
|
||||
|
||||
@ -57,11 +57,16 @@
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
#endif
|
||||
|
||||
//#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
#include <Kokkos_OpenMPTarget.hpp>
|
||||
#include <Kokkos_OpenMPTargetSpace.hpp>
|
||||
//#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
#include <Kokkos_Qthreads.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
#include <Kokkos_Threads.hpp>
|
||||
#endif
|
||||
|
||||
@ -69,8 +74,8 @@
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
#include <Kokkos_Pair.hpp>
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
#include <Kokkos_Array.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
@ -167,3 +172,4 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -96,7 +96,7 @@ class Serial; ///< Execution space main process on CPU.
|
||||
class Qthreads; ///< Execution space with Qthreads back-end.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
class Threads; ///< Execution space with pthreads back-end.
|
||||
#endif
|
||||
|
||||
@ -104,6 +104,14 @@ class Threads; ///< Execution space with pthreads back-end.
|
||||
class OpenMP; ///< OpenMP execution space.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
namespace Experimental {
|
||||
class OpenMPTarget; ///< OpenMPTarget execution space.
|
||||
class OpenMPTargetSpace;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
class CudaSpace; ///< Memory space on Cuda GPU
|
||||
class CudaUVMSpace; ///< Memory space on Cuda GPU with UVM
|
||||
@ -121,12 +129,14 @@ struct Device;
|
||||
|
||||
/// Define Kokkos::DefaultExecutionSpace as per configuration option
|
||||
/// or chosen from the enabled execution spaces in the following order:
|
||||
/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
|
||||
/// Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
typedef Cuda DefaultExecutionSpace;
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
|
||||
typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef OpenMP DefaultExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
@ -136,7 +146,7 @@ namespace Kokkos {
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
typedef Serial DefaultExecutionSpace;
|
||||
#else
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
@ -149,7 +159,7 @@ namespace Kokkos {
|
||||
typedef Serial DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef OpenMP DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
typedef Threads DefaultHostExecutionSpace;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Qthreads DefaultHostExecutionSpace;
|
||||
@ -254,6 +264,21 @@ template< class FunctorType, class ExecPolicy, class ExecutionSapce =
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
namespace Experimental {
|
||||
template<class ScalarType , class Space = HostSpace> struct Sum;
|
||||
template<class ScalarType , class Space = HostSpace> struct Prod;
|
||||
template<class ScalarType , class Space = HostSpace> struct Min;
|
||||
template<class ScalarType , class Space = HostSpace> struct Max;
|
||||
template<class ScalarType , class Space = HostSpace> struct MinMax;
|
||||
template<class ScalarType , class Index, class Space = HostSpace> struct MinLoc;
|
||||
template<class ScalarType , class Index, class Space = HostSpace> struct MaxLoc;
|
||||
template<class ScalarType , class Index, class Space = HostSpace> struct MinMaxLoc;
|
||||
template<class ScalarType , class Space = HostSpace> struct BAnd;
|
||||
template<class ScalarType , class Space = HostSpace> struct BOr;
|
||||
template<class ScalarType , class Space = HostSpace> struct LAnd;
|
||||
template<class ScalarType , class Space = HostSpace> struct LOr;
|
||||
}
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
|
||||
|
||||
|
||||
@ -44,12 +44,11 @@
|
||||
#ifndef KOKKOS_CUDA_HPP
|
||||
#define KOKKOS_CUDA_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
// If CUDA execution space is enabled then use this header file.
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <iosfwd>
|
||||
#include <vector>
|
||||
|
||||
@ -214,6 +213,8 @@ public:
|
||||
//@}
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
static const char* name();
|
||||
|
||||
private:
|
||||
|
||||
cudaStream_t m_stream ;
|
||||
@ -291,6 +292,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
|
||||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_View.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Team.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Task.hpp>
|
||||
|
||||
@ -300,5 +302,3 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_HPP */
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,10 +44,11 @@
|
||||
#ifndef KOKKOS_CUDASPACE_HPP
|
||||
#define KOKKOS_CUDASPACE_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <iosfwd>
|
||||
#include <typeinfo>
|
||||
#include <string>
|
||||
@ -717,7 +718,7 @@ private:
|
||||
static ::cudaTextureObject_t
|
||||
attach_texture_object( const unsigned sizeof_alias
|
||||
, void * const alloc_ptr
|
||||
, const size_t alloc_size );
|
||||
, const size_t alloc_size );
|
||||
|
||||
static RecordBase s_root_record ;
|
||||
|
||||
|
||||
@ -565,5 +565,3 @@ ThreadVectorRange( const TeamMemberType&, const iType& count );
|
||||
|
||||
#endif /* #define KOKKOS_EXECPOLICY_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -44,12 +44,11 @@
|
||||
#ifndef KOKKOS_HBWSPACE_HPP
|
||||
#define KOKKOS_HBWSPACE_HPP
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Experimental {
|
||||
@ -114,7 +113,7 @@ public:
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
@ -348,5 +347,5 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokk
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
|
||||
#endif // #define KOKKOS_HBWSPACE_HPP
|
||||
|
||||
|
||||
@ -117,7 +117,7 @@ public:
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
@ -265,7 +265,7 @@ public:
|
||||
return (SharedAllocationRecord *) 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
@ -316,3 +316,4 @@ struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // #define KOKKOS_HOSTSPACE_HPP
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -47,7 +47,7 @@
|
||||
#ifndef KOKKOS_LAYOUT_HPP
|
||||
#define KOKKOS_LAYOUT_HPP
|
||||
|
||||
#include <stddef.h>
|
||||
#include <cstddef>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
@ -62,7 +62,7 @@ enum { ARRAY_LAYOUT_MAX_RANK = 8 };
|
||||
///
|
||||
/// This is an example of a \c MemoryLayout template parameter of
|
||||
/// View. The memory layout describes how View maps from a
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
///
|
||||
/// "Layout left" indicates a mapping where the leftmost index i0
|
||||
/// refers to contiguous access, and strides increase for dimensions
|
||||
@ -95,7 +95,7 @@ struct LayoutLeft {
|
||||
///
|
||||
/// This is an example of a \c MemoryLayout template parameter of
|
||||
/// View. The memory layout describes how View maps from a
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
///
|
||||
/// "Right layout" indicates a mapping where the rightmost index ik
|
||||
/// refers to contiguous access, and strides increase for dimensions
|
||||
@ -130,7 +130,7 @@ struct LayoutStride {
|
||||
typedef LayoutStride array_layout ;
|
||||
|
||||
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
|
||||
LayoutStride( LayoutStride const & ) = default ;
|
||||
LayoutStride( LayoutStride && ) = default ;
|
||||
@ -192,7 +192,7 @@ struct LayoutStride {
|
||||
///
|
||||
/// This is an example of a \c MemoryLayout template parameter of
|
||||
/// View. The memory layout describes how View maps from a
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
///
|
||||
/// "Tiled layout" indicates a mapping to contiguously stored
|
||||
/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
|
||||
|
||||
@ -48,9 +48,10 @@
|
||||
/** Pick up configure / build options via #define macros:
|
||||
*
|
||||
* KOKKOS_ENABLE_CUDA Kokkos::Cuda execution and memory spaces
|
||||
* KOKKOS_ENABLE_PTHREAD Kokkos::Threads execution space
|
||||
* KOKKOS_ENABLE_THREADS Kokkos::Threads execution space
|
||||
* KOKKOS_ENABLE_QTHREADS Kokkos::Qthreads execution space
|
||||
* KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space
|
||||
* KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget execution space
|
||||
* KOKKOS_ENABLE_HWLOC HWLOC library is available.
|
||||
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive!
|
||||
* KOKKOS_ENABLE_MPI Negotiate MPI/execution space interactions.
|
||||
@ -414,6 +415,7 @@
|
||||
// There is zero or one default execution space specified.
|
||||
|
||||
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
|
||||
@ -424,15 +426,18 @@
|
||||
// If default is not specified then choose from enabled execution spaces.
|
||||
// Priority: CUDA, OPENMPTARGET, OPENMP, THREADS, QTHREADS, SERIAL
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
#elif defined( KOKKOS_ENABLE_CUDA )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
|
||||
#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
|
||||
@ -459,10 +464,17 @@
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Enable Profiling by default
|
||||
// If compiling with CUDA then must be using CUDA 8 or better
|
||||
// and use relocatable device code to enable the task policy.
|
||||
// nvcc relocatable device code option: --relocatable-device-code=true
|
||||
|
||||
#ifndef KOKKOS_ENABLE_PROFILING
|
||||
#define KOKKOS_ENABLE_PROFILING 1
|
||||
#if ( defined( KOKKOS_ENABLE_CUDA ) )
|
||||
#if ( 8000 <= CUDA_VERSION ) && defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
#endif
|
||||
#else
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
#endif
|
||||
|
||||
#endif // #ifndef KOKKOS_MACROS_HPP
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
217
lib/kokkos/core/src/Kokkos_NumericTraits.hpp
Normal file
217
lib/kokkos/core/src/Kokkos_NumericTraits.hpp
Normal file
@ -0,0 +1,217 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_NUMERICTRAITS_HPP
|
||||
#define KOKKOS_NUMERICTRAITS_HPP
|
||||
|
||||
#include<climits>
|
||||
#include<cfloat>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/// Identity elements for the built-in reductions over a value type T.
///
/// Only the declaration is provided here: the template is usable solely
/// through explicit specializations, so a type without one fails to compile
/// when its identity is requested.
template <typename T>
struct reduction_identity;

/* Reference sketch of the interface a specialization is expected to provide
 * (kept commented out on purpose; it is not compiled):
 *
 * {
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T sum() { return T(); }  // 0
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T prod()  // 1
 *     { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom prod reduction type"); return T(); }
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T max()   // minimum value
 *     { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom max reduction type"); return T(); }
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T min()   // maximum value
 *     { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom min reduction type"); return T(); }
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T bor()   // 0, only for integer type
 *     { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom bor reduction type"); return T(); }
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T band()  // !0, only for integer type
 *     { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom band reduction type"); return T(); }
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T lor()   // 0, only for integer type
 *     { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom lor reduction type"); return T(); }
 *   KOKKOS_FORCEINLINE_FUNCTION constexpr static T land()  // !0, only for integer type
 *     { static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom land reduction type"); return T(); }
 * };
 */
|
||||
|
||||
template<>
|
||||
struct reduction_identity<signed char> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char sum() {return static_cast<signed char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char prod() {return static_cast<signed char>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char max() {return SCHAR_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char min() {return SCHAR_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char bor() {return static_cast<signed char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char band() {return ~static_cast<signed char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char lor() {return static_cast<signed char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char land() {return static_cast<signed char>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<short> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short sum() {return static_cast<short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short prod() {return static_cast<short>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short max() {return SHRT_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short min() {return SHRT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short bor() {return static_cast<short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short band() {return ~static_cast<short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short lor() {return static_cast<short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short land() {return static_cast<short>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<int> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int sum() {return static_cast<int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int prod() {return static_cast<int>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int max() {return INT_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int min() {return INT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int bor() {return static_cast<int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int band() {return ~static_cast<int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int lor() {return static_cast<int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int land() {return static_cast<int>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long sum() {return static_cast<long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long prod() {return static_cast<long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long max() {return LLONG_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long min() {return LLONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long bor() {return static_cast<long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long band() {return ~static_cast<long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long lor() {return static_cast<long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long land() {return static_cast<long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<long long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long sum() {return static_cast<long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long prod() {return static_cast<long long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long max() {return LLONG_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long min() {return LLONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long bor() {return static_cast<long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long band() {return ~static_cast<long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long lor() {return static_cast<long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long land() {return static_cast<long long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned char> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char sum() {return static_cast<unsigned char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char prod() {return static_cast<unsigned char>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char max() {return static_cast<unsigned char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char min() {return UCHAR_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char bor() {return static_cast<unsigned char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char band() {return ~static_cast<unsigned char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char lor() {return static_cast<unsigned char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char land() {return static_cast<unsigned char>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned short> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short sum() {return static_cast<unsigned short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short prod() {return static_cast<unsigned short>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short max() {return static_cast<unsigned short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short min() {return USHRT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short bor() {return static_cast<unsigned short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short band() {return ~static_cast<unsigned short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short lor() {return static_cast<unsigned short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short land() {return static_cast<unsigned short>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned int> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int sum() {return static_cast<unsigned int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int prod() {return static_cast<unsigned int>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int max() {return static_cast<unsigned int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int min() {return UINT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int bor() {return static_cast<unsigned int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int band() {return ~static_cast<unsigned int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int lor() {return static_cast<unsigned int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int land() {return static_cast<unsigned int>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long sum() {return static_cast<unsigned long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long prod() {return static_cast<unsigned long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long max() {return static_cast<unsigned long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long min() {return ULONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long bor() {return static_cast<unsigned long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long band() {return ~static_cast<unsigned long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long lor() {return static_cast<unsigned long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long land() {return static_cast<unsigned long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned long long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long sum() {return static_cast<unsigned long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long prod() {return static_cast<unsigned long long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long max() {return static_cast<unsigned long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long min() {return ULLONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long bor() {return static_cast<unsigned long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long band() {return ~static_cast<unsigned long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long lor() {return static_cast<unsigned long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long land() {return static_cast<unsigned long long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<float> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast<float>(0.0f);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<double> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast<double>(0.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<long double> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast<long double>(0.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -44,22 +44,23 @@
|
||||
#ifndef KOKKOS_OPENMP_HPP
|
||||
#define KOKKOS_OPENMP_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP)
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP) && !defined(_OPENMP)
|
||||
#if !defined(_OPENMP)
|
||||
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP )
|
||||
|
||||
#include <omp.h>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
#include <Kokkos_HBWSpace.hpp>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskScheduler.hpp>
|
||||
@ -79,26 +80,26 @@ public:
|
||||
//@{
|
||||
|
||||
//! Tag this class as a kokkos execution space
|
||||
typedef OpenMP execution_space ;
|
||||
using execution_space = OpenMP;
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
typedef Experimental::HBWSpace memory_space ;
|
||||
using memory_space = Experimental::HBWSpace;
|
||||
#else
|
||||
typedef HostSpace memory_space ;
|
||||
using memory_space = HostSpace;
|
||||
#endif
|
||||
//! This execution space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
using device_type = Kokkos::Device<execution_space,memory_space>;
|
||||
|
||||
typedef LayoutRight array_layout ;
|
||||
typedef memory_space::size_type size_type ;
|
||||
using array_layout = LayoutRight;
|
||||
using size_type = memory_space::size_type;
|
||||
|
||||
typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
|
||||
using scratch_memory_space = ScratchMemorySpace< OpenMP >;
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
//! \name Functions that all Kokkos execution spaces must implement.
|
||||
//@{
|
||||
|
||||
inline static bool in_parallel() { return omp_in_parallel(); }
|
||||
inline static bool in_parallel();
|
||||
|
||||
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
|
||||
static bool sleep();
|
||||
@ -153,6 +154,8 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
|
||||
static const char* name();
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
@ -164,7 +167,7 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess
|
||||
struct MemorySpaceAccess
|
||||
< Kokkos::OpenMP::memory_space
|
||||
, Kokkos::OpenMP::scratch_memory_space
|
||||
>
|
||||
@ -191,7 +194,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <OpenMP/Kokkos_OpenMPexec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
|
||||
|
||||
@ -201,4 +204,3 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */
|
||||
#endif /* #ifndef KOKKOS_OPENMP_HPP */
|
||||
|
||||
|
||||
|
||||
186
lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
Normal file
186
lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
Normal file
@ -0,0 +1,186 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGET_HPP
|
||||
#define KOKKOS_OPENMPTARGET_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP )
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_OpenMPTargetSpace.hpp>
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
/// \class OpenMPTarget
|
||||
/// \brief Kokkos device for multicore processors in the host memory space.
|
||||
class OpenMPTarget {
|
||||
public:
|
||||
//------------------------------------
|
||||
//! \name Type declarations that all Kokkos devices must provide.
|
||||
//@{
|
||||
|
||||
//! Tag this class as a kokkos execution space
|
||||
typedef OpenMPTarget execution_space ;
|
||||
typedef OpenMPTargetSpace memory_space ;
|
||||
//! This execution space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
typedef LayoutLeft array_layout ;
|
||||
typedef memory_space::size_type size_type ;
|
||||
|
||||
typedef ScratchMemorySpace< OpenMPTarget > scratch_memory_space ;
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
//! \name Functions that all Kokkos execution spaces must implement.
|
||||
//@{
|
||||
|
||||
inline static bool in_parallel() { return omp_in_parallel(); }
|
||||
|
||||
/** \brief Set the device in a "sleep" state. A noop for OpenMPTarget. */
|
||||
static bool sleep();
|
||||
|
||||
/** \brief Wake the device from the 'sleep' state. A noop for OpenMPTarget. */
|
||||
static bool wake();
|
||||
|
||||
/** \brief Wait until all dispatched functors complete. A noop for OpenMPTarget. */
|
||||
static void fence() {}
|
||||
|
||||
/// \brief Print configuration information to the given output stream.
|
||||
static void print_configuration( std::ostream & , const bool detail = false );
|
||||
|
||||
/// \brief Free any resources being consumed by the device.
|
||||
static void finalize();
|
||||
|
||||
/** \brief Initialize the device.
|
||||
*
|
||||
* 1) If the hardware locality library is enabled and OpenMPTarget has not
|
||||
* already bound threads then bind OpenMPTarget threads to maximize
|
||||
* core utilization and group for memory hierarchy locality.
|
||||
*
|
||||
* 2) Allocate a HostThread for each OpenMPTarget thread to hold its
|
||||
* topology and fan in/out data.
|
||||
*/
|
||||
static void initialize( unsigned thread_count = 0 ,
|
||||
unsigned use_numa_count = 0 ,
|
||||
unsigned use_cores_per_numa = 0 );
|
||||
|
||||
static int is_initialized();
|
||||
|
||||
/** \brief Return the maximum amount of concurrency. */
|
||||
static int concurrency();
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
/** \brief This execution space has a topological thread pool which can be queried.
|
||||
*
|
||||
* All threads within a pool have a common memory space for which they are cache coherent.
|
||||
* depth = 0 gives the number of threads in the whole pool.
|
||||
* depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache.
|
||||
* depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache.
|
||||
*/
|
||||
inline static int thread_pool_size( int depth = 0 );
|
||||
|
||||
/** \brief The rank of the executing thread in this thread pool */
|
||||
KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
|
||||
|
||||
//------------------------------------
|
||||
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
|
||||
static const char* name();
|
||||
private:
|
||||
static bool m_is_initialized;
|
||||
};
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::Experimental::OpenMPTarget::memory_space
|
||||
, Kokkos::Experimental::OpenMPTarget::scratch_memory_space
|
||||
>
|
||||
{
|
||||
enum { value = true };
|
||||
inline static void verify( void ) { }
|
||||
inline static void verify( const void * ) { }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP ) */
|
||||
#endif /* #ifndef KOKKOS_OPENMPTARGET_HPP */
|
||||
|
||||
|
||||
265
lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
Normal file
265
lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
Normal file
@ -0,0 +1,265 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGETSPACE_HPP
|
||||
#define KOKKOS_OPENMPTARGETSPACE_HPP
|
||||
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <iosfwd>
|
||||
#include <typeinfo>
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <omp.h>
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/// \brief Initialize lock array for arbitrary size atomics.
|
||||
///
|
||||
/// Arbitrary atomics are implemented using a hash table of locks
|
||||
/// where the hash value is derived from the address of the
|
||||
/// object for which an atomic operation is performed.
|
||||
/// This function initializes the locks to zero (unset).
|
||||
//void init_lock_array_host_space();
|
||||
|
||||
/// \brief Acquire a lock for the address
|
||||
///
|
||||
/// This function tries to acquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully acquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
//bool lock_address_host_space(void* ptr);
|
||||
|
||||
/// \brief Release lock for the address
|
||||
///
|
||||
/// This function releases the lock for the hash value derived
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully acquiring a lock with
|
||||
/// lock_address.
|
||||
//void unlock_address_host_space(void* ptr);
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
/// \class OpenMPTargetSpace
|
||||
/// \brief Memory management for host memory.
|
||||
///
|
||||
/// OpenMPTargetSpace is a memory space that governs host memory. "Host"
|
||||
/// memory means the usual CPU-accessible memory.
|
||||
class OpenMPTargetSpace {
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos memory space
|
||||
typedef OpenMPTargetSpace memory_space ;
|
||||
typedef size_t size_type ;
|
||||
|
||||
/// \typedef execution_space
|
||||
/// \brief Default execution space for this memory space.
|
||||
///
|
||||
/// Every memory space has a default execution space. This is
|
||||
/// useful for things like initializing a View (which happens in
|
||||
/// parallel using the View's default execution space).
|
||||
typedef Kokkos::Experimental::OpenMPTarget execution_space ;
|
||||
|
||||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
/**\brief Default memory space instance */
|
||||
OpenMPTargetSpace();
|
||||
OpenMPTargetSpace( OpenMPTargetSpace && rhs ) = default ;
|
||||
OpenMPTargetSpace( const OpenMPTargetSpace & rhs ) = default ;
|
||||
OpenMPTargetSpace & operator = ( OpenMPTargetSpace && ) = default ;
|
||||
OpenMPTargetSpace & operator = ( const OpenMPTargetSpace & ) = default ;
|
||||
~OpenMPTargetSpace() = default ;
|
||||
|
||||
/**\brief Allocate untracked memory in the space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Deallocate untracked memory in the space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
private:
|
||||
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > ;
|
||||
};
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >
|
||||
: public SharedAllocationRecord< void , void >
|
||||
{
|
||||
private:
|
||||
|
||||
friend Kokkos::Experimental::OpenMPTargetSpace ;
|
||||
|
||||
typedef SharedAllocationRecord< void , void > RecordBase ;
|
||||
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
|
||||
|
||||
static void deallocate( RecordBase * );
|
||||
|
||||
/**\brief Root record for tracked allocations from this OpenMPTargetSpace instance */
|
||||
static RecordBase s_root_record ;
|
||||
|
||||
const Kokkos::Experimental::OpenMPTargetSpace m_space ;
|
||||
|
||||
protected:
|
||||
|
||||
~SharedAllocationRecord();
|
||||
SharedAllocationRecord() = default ;
|
||||
|
||||
SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
, const RecordBase::function_type arg_dealloc = & deallocate
|
||||
);
|
||||
|
||||
public:
|
||||
|
||||
std::string get_label() const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
SharedAllocationRecord * allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
);
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Reallocate tracked memory in the space */
|
||||
static
|
||||
void * reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Deallocate tracked memory in the space */
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
static void print_records( std::ostream & , const Kokkos::Experimental::OpenMPTargetSpace & , bool detail = false );
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//TODO: implement all possible deep_copies
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,HostSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
|
||||
}
|
||||
};
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<HostSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::OpenMPTargetSpace >
|
||||
{
|
||||
enum { value = false };
|
||||
inline static void verify( void ) { }
|
||||
inline static void verify( const void * ) { }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -44,9 +44,10 @@
|
||||
#ifndef KOKKOS_QTHREADS_HPP
|
||||
#define KOKKOS_QTHREADS_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#ifdef KOKKOS_ENABLE_QTHREADS
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
// Defines to enable experimental Qthreads functionality.
|
||||
#define QTHREAD_LOCAL_PRIORITY
|
||||
@ -150,6 +151,8 @@ public:
|
||||
|
||||
int shepherd_size() const;
|
||||
int shepherd_worker_size() const;
|
||||
|
||||
static const char* name();
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
@ -194,5 +197,5 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
|
||||
|
||||
#endif // #define KOKKOS_ENABLE_QTHREADS
|
||||
|
||||
#endif // #define KOKKOS_QTHREADS_HPP
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,9 +44,9 @@
|
||||
#ifndef KOKKOS_SCRATCHSPACE_HPP
|
||||
#define KOKKOS_SCRATCHSPACE_HPP
|
||||
|
||||
#include <stdio.h>
|
||||
#include <cstdio>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <Kokkos_Concepts.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
@ -57,7 +57,7 @@ namespace Kokkos {
|
||||
*/
|
||||
template< class ExecSpace >
|
||||
class ScratchMemorySpace {
|
||||
static_assert (Impl::is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
|
||||
static_assert (is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
|
||||
public:
|
||||
|
||||
// Alignment of memory chunks returned by 'get'
|
||||
@ -161,6 +161,3 @@ public:
|
||||
|
||||
#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -47,6 +47,9 @@
|
||||
#ifndef KOKKOS_SERIAL_HPP
|
||||
#define KOKKOS_SERIAL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_SERIAL )
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
@ -63,7 +66,6 @@
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_SERIAL )
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
@ -163,6 +165,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
|
||||
static const char* name();
|
||||
//--------------------------------------------------------------------------
|
||||
};
|
||||
|
||||
@ -515,7 +518,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -798,7 +801,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_shared( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
@ -821,5 +824,3 @@ public:
|
||||
#endif // defined( KOKKOS_ENABLE_SERIAL )
|
||||
#endif /* #define KOKKOS_SERIAL_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -46,25 +46,10 @@
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
// If compiling with CUDA then must be using CUDA 8 or better
|
||||
// and use relocateable device code to enable the task policy.
|
||||
// nvcc relocatable device code option: --relocatable-device-code=true
|
||||
|
||||
#if ( defined( KOKKOS_ENABLE_CUDA ) )
|
||||
#if ( 8000 <= CUDA_VERSION ) && \
|
||||
defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
|
||||
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
|
||||
#endif
|
||||
#else
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
@ -371,7 +356,7 @@ struct TaskPolicyData
|
||||
TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicyData( DepFutureType && arg_future
|
||||
TaskPolicyData( DepFutureType const & arg_future
|
||||
, Kokkos::TaskPriority const & arg_priority )
|
||||
: m_scheduler( 0 )
|
||||
, m_dependence( arg_future )
|
||||
@ -385,6 +370,15 @@ struct TaskPolicyData
|
||||
, m_dependence()
|
||||
, m_priority( static_cast<int>( arg_priority ) )
|
||||
{}
|
||||
|
||||
/// Build policy data from an explicit scheduler together with a dependence.
///
/// \param arg_scheduler  scheduler whose address is stored in m_scheduler;
///                       NOTE(review): the referenced object presumably has to
///                       outlive this policy data - confirm with callers
/// \param arg_future     dependence copied into m_dependence
/// \param arg_priority   task priority, stored as its int value
KOKKOS_INLINE_FUNCTION
TaskPolicyData( scheduler_type       const & arg_scheduler
              , DepFutureType        const & arg_future
              , Kokkos::TaskPriority const & arg_priority )
  : m_scheduler( & arg_scheduler )
  , m_dependence( arg_future )
  , m_priority( static_cast<int>( arg_priority ) )
  {}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -413,6 +407,7 @@ public:
|
||||
|
||||
using execution_space = ExecSpace ;
|
||||
using memory_space = typename queue_type::memory_space ;
|
||||
using memory_pool = typename queue_type::memory_pool ;
|
||||
using member_type =
|
||||
typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
|
||||
|
||||
@ -431,9 +426,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskScheduler & operator = ( TaskScheduler const & rhs ) = default ;
|
||||
|
||||
TaskScheduler( memory_space const & arg_memory_space
|
||||
, unsigned const arg_memory_pool_capacity
|
||||
, unsigned const arg_memory_pool_log2_superblock = 12 )
|
||||
TaskScheduler( memory_pool const & arg_memory_pool )
|
||||
: m_track()
|
||||
, m_queue(0)
|
||||
{
|
||||
@ -442,21 +435,37 @@ public:
|
||||
record_type ;
|
||||
|
||||
record_type * record =
|
||||
record_type::allocate( arg_memory_space
|
||||
record_type::allocate( memory_space()
|
||||
, "TaskQueue"
|
||||
, sizeof(queue_type)
|
||||
);
|
||||
|
||||
m_queue = new( record->data() )
|
||||
queue_type( arg_memory_space
|
||||
, arg_memory_pool_capacity
|
||||
, arg_memory_pool_log2_superblock );
|
||||
m_queue = new( record->data() ) queue_type( arg_memory_pool );
|
||||
|
||||
record->m_destroy.m_queue = m_queue ;
|
||||
|
||||
m_track.assign_allocated_record_to_uninitialized( record );
|
||||
}
|
||||
|
||||
/// Convenience constructor: creates a memory_pool from the given memory space
/// and sizing arguments, then delegates to TaskScheduler( memory_pool const & ).
///
/// The trailing "= ..." remarks on the block-size parameters are kept from the
/// original declaration; they are comments only, no default argument exists.
TaskScheduler( memory_space const & arg_memory_space
             , size_t       const   mempool_capacity
             , unsigned     const   mempool_min_block_size  // = 1u << 6
             , unsigned     const   mempool_max_block_size  // = 1u << 10
             , unsigned     const   mempool_superblock_size // = 1u << 12
             )
  : TaskScheduler
      ( memory_pool( arg_memory_space
                   , mempool_capacity
                   , mempool_min_block_size
                   , mempool_max_block_size
                   , mempool_superblock_size
                   )
      )
  {}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
/// Pointer to the memory pool owned by the task queue, or null when this
/// scheduler has no queue.
///
/// m_memory is accessed as an object elsewhere in this class
/// ( m_queue->m_memory.capacity() ), so its address has to be taken here;
/// returning the member itself does not convert to memory_pool*.
KOKKOS_INLINE_FUNCTION
memory_pool * memory() const noexcept
  { return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; }
|
||||
|
||||
//----------------------------------------
|
||||
/**\brief Allocation size for a spawned task */
|
||||
template< typename FunctorType >
|
||||
@ -502,7 +511,12 @@ public:
|
||||
: (queue_type*) 0 );
|
||||
|
||||
if ( 0 == queue ) {
|
||||
Kokkos::abort("Kokkos spawn given null Future" );
|
||||
Kokkos::abort("Kokkos spawn requires scheduler or non-null Future");
|
||||
}
|
||||
|
||||
if ( arg_policy.m_dependence.m_task != 0 &&
|
||||
arg_policy.m_dependence.m_task->m_queue != queue ) {
|
||||
Kokkos::abort("Kokkos spawn given incompatible scheduler and Future");
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
@ -641,7 +655,7 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocation_capacity() const noexcept
|
||||
{ return m_queue->m_memory.get_mem_size(); }
|
||||
{ return m_queue->m_memory.capacity(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocated_task_count() const noexcept
|
||||
@ -696,6 +710,22 @@ TaskTeam( T const & arg
|
||||
>( arg , arg_priority );
|
||||
}
|
||||
|
||||
template< typename E , typename F >
|
||||
Kokkos::Impl::
|
||||
TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskTeam( TaskScheduler<E> const & arg_scheduler
|
||||
, F const & arg_future
|
||||
, typename std::enable_if< Kokkos::is_future<F>::value ,
|
||||
TaskPriority >::type const & arg_priority = TaskPriority::Regular
|
||||
)
|
||||
{
|
||||
return
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
|
||||
( arg_scheduler , arg_future , arg_priority );
|
||||
}
|
||||
|
||||
// Construct a TaskSingle execution policy
|
||||
|
||||
template< typename T >
|
||||
@ -721,6 +751,22 @@ TaskSingle( T const & arg
|
||||
>( arg , arg_priority );
|
||||
}
|
||||
|
||||
template< typename E , typename F >
|
||||
Kokkos::Impl::
|
||||
TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskSingle( TaskScheduler<E> const & arg_scheduler
|
||||
, F const & arg_future
|
||||
, typename std::enable_if< Kokkos::is_future<F>::value ,
|
||||
TaskPriority >::type const & arg_priority = TaskPriority::Regular
|
||||
)
|
||||
{
|
||||
return
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
|
||||
( arg_scheduler , arg_future , arg_priority );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/**\brief A host control thread spawns a task with options
|
||||
@ -849,3 +895,4 @@ void wait( TaskScheduler< ExecSpace > const & scheduler )
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #ifndef KOKKOS_TASKSCHEDULER_HPP */
|
||||
|
||||
|
||||
@ -44,9 +44,10 @@
|
||||
#ifndef KOKKOS_THREADS_HPP
|
||||
#define KOKKOS_THREADS_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
@ -178,6 +179,7 @@ public:
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
|
||||
static const char* name();
|
||||
//@}
|
||||
//----------------------------------------
|
||||
};
|
||||
@ -190,7 +192,7 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess
|
||||
struct MemorySpaceAccess
|
||||
< Kokkos::Threads::memory_space
|
||||
, Kokkos::Threads::scratch_memory_space
|
||||
>
|
||||
@ -227,6 +229,6 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
|
||||
#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
|
||||
#endif /* #define KOKKOS_THREADS_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,7 +44,7 @@
|
||||
#ifndef KOKKOS_TIMER_HPP
|
||||
#define KOKKOS_TIMER_HPP
|
||||
|
||||
#include <stddef.h>
|
||||
#include <cstddef>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#undef KOKKOS_ENABLE_LIBRT
|
||||
@ -110,3 +110,4 @@ public:
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #ifndef KOKKOS_TIMER_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -51,3 +51,4 @@
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -625,7 +625,7 @@ private:
|
||||
|
||||
#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
|
||||
View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
|
||||
Kokkos::Impl::view_verify_operator_bounds ARG ;
|
||||
Kokkos::Impl::view_verify_operator_bounds< typename traits::memory_space > ARG ;
|
||||
|
||||
#else
|
||||
|
||||
@ -646,12 +646,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator()( Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,args...) )
|
||||
return m_map.reference();
|
||||
}
|
||||
|
||||
@ -669,12 +664,7 @@ public:
|
||||
operator()( const I0 & i0
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
|
||||
return m_map.reference(i0);
|
||||
}
|
||||
|
||||
@ -690,13 +680,7 @@ public:
|
||||
operator()( const I0 & i0
|
||||
, Args ... args ) const
|
||||
{
|
||||
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
|
||||
return m_map.m_handle[ i0 ];
|
||||
}
|
||||
|
||||
@ -712,12 +696,7 @@ public:
|
||||
operator()( const I0 & i0
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
|
||||
}
|
||||
|
||||
@ -733,12 +712,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator[]( const I0 & i0 ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
|
||||
return m_map.reference(i0);
|
||||
}
|
||||
|
||||
@ -752,12 +726,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator[]( const I0 & i0 ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
|
||||
return m_map.m_handle[ i0 ];
|
||||
}
|
||||
|
||||
@ -771,12 +740,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator[]( const I0 & i0 ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
|
||||
return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
|
||||
}
|
||||
|
||||
@ -794,12 +758,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.reference(i0,i1);
|
||||
}
|
||||
|
||||
@ -815,12 +774,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
|
||||
}
|
||||
|
||||
@ -836,12 +790,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
|
||||
}
|
||||
|
||||
@ -857,12 +806,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
|
||||
}
|
||||
|
||||
@ -878,12 +822,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
|
||||
}
|
||||
|
||||
@ -899,12 +838,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
|
||||
i1 * m_map.m_offset.m_stride.S1 ];
|
||||
}
|
||||
@ -923,12 +857,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
|
||||
}
|
||||
|
||||
@ -943,12 +872,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
|
||||
return m_map.reference(i0,i1,i2);
|
||||
}
|
||||
|
||||
@ -966,12 +890,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
|
||||
}
|
||||
|
||||
@ -986,12 +905,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3);
|
||||
}
|
||||
|
||||
@ -1011,12 +925,7 @@ public:
|
||||
, const I4 & i4
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
|
||||
}
|
||||
|
||||
@ -1033,12 +942,7 @@ public:
|
||||
, const I4 & i4
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4);
|
||||
}
|
||||
|
||||
@ -1058,12 +962,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
|
||||
}
|
||||
|
||||
@ -1080,12 +979,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5);
|
||||
}
|
||||
|
||||
@ -1105,12 +999,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
|
||||
}
|
||||
|
||||
@ -1127,12 +1016,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
|
||||
}
|
||||
|
||||
@ -1152,12 +1036,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
|
||||
}
|
||||
|
||||
@ -1174,12 +1053,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
|
||||
}
|
||||
|
||||
|
||||
@ -9,14 +9,14 @@ default: messages build-lib
|
||||
echo "End Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
|
||||
else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?=
|
||||
LDFLAGS ?=
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
@ -66,7 +66,7 @@ else
|
||||
KOKKOS_DEBUG_CMAKE = ON
|
||||
endif
|
||||
|
||||
messages:
|
||||
messages:
|
||||
echo "Start Build"
|
||||
|
||||
build-makefile-kokkos:
|
||||
@ -80,13 +80,13 @@ build-makefile-kokkos:
|
||||
echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
|
||||
echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
|
||||
echo "CXX ?= $(CXX)" >> Makefile.kokkos
|
||||
echo "CXX ?= $(CXX)" >> Makefile.kokkos
|
||||
echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
|
||||
echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "#Variables used in application Makefiles" >> Makefile.kokkos
|
||||
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
|
||||
@ -133,11 +133,11 @@ build-cmake-kokkos:
|
||||
echo "else()" >> kokkos.cmake
|
||||
echo ' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
|
||||
echo "endif()" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
|
||||
echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "#Variables used in application Makefiles" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
@ -164,7 +164,7 @@ build-cmake-kokkos:
|
||||
|
||||
build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
|
||||
|
||||
mkdir:
|
||||
mkdir:
|
||||
mkdir -p $(PREFIX)
|
||||
mkdir -p $(PREFIX)/bin
|
||||
mkdir -p $(PREFIX)/include
|
||||
@ -187,7 +187,7 @@ copy-openmp: mkdir
|
||||
mkdir -p $(PREFIX)/include/OpenMP
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
|
||||
|
||||
install: mkdir $(CONDITIONAL_COPIES) build-lib
|
||||
install: mkdir $(CONDITIONAL_COPIES) build-lib
|
||||
cp $(COPY_FLAG) $(NVCC_WRAPPER) $(PREFIX)/bin
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
|
||||
|
||||
@ -41,7 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <cstdio>
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
@ -51,7 +54,6 @@
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_OPENMP
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
@ -82,13 +84,13 @@ bool s_using_hwloc = false;
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
|
||||
int OpenMPExec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
void OpenMPexec::verify_is_process( const char * const label )
|
||||
void OpenMPExec::verify_is_process( const char * const label )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
std::string msg( label );
|
||||
@ -97,7 +99,7 @@ void OpenMPexec::verify_is_process( const char * const label )
|
||||
}
|
||||
}
|
||||
|
||||
void OpenMPexec::verify_initialized( const char * const label )
|
||||
void OpenMPExec::verify_initialized( const char * const label )
|
||||
{
|
||||
if ( 0 == m_pool[0] ) {
|
||||
std::string msg( label );
|
||||
@ -122,7 +124,7 @@ void OpenMPexec::verify_initialized( const char * const label )
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
void OpenMPexec::clear_thread_data()
|
||||
void OpenMPExec::clear_thread_data()
|
||||
{
|
||||
const size_t member_bytes =
|
||||
sizeof(int64_t) *
|
||||
@ -149,7 +151,7 @@ void OpenMPexec::clear_thread_data()
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
|
||||
void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
|
||||
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
|
||||
, size_t team_reduce_bytes
|
||||
, size_t team_shared_bytes
|
||||
, size_t thread_local_bytes )
|
||||
@ -231,7 +233,7 @@ namespace Kokkos {
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
int OpenMP::is_initialized()
|
||||
{ return 0 != Impl::OpenMPexec::m_pool[0]; }
|
||||
{ return 0 != Impl::OpenMPExec::m_pool[0]; }
|
||||
|
||||
void OpenMP::initialize( unsigned thread_count ,
|
||||
unsigned use_numa_count ,
|
||||
@ -257,7 +259,7 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
|
||||
static int omp_max_threads = nthreads;
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
|
||||
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
|
||||
|
||||
bool thread_spawn_failed = false ;
|
||||
|
||||
@ -270,7 +272,7 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
|
||||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
|
||||
|
||||
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
|
||||
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ];
|
||||
|
||||
// If hwloc available then use it's maximum value.
|
||||
|
||||
@ -316,16 +318,16 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
|
||||
: omp_rank ;
|
||||
|
||||
Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
|
||||
Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ;
|
||||
}
|
||||
/* END #pragma omp critical */
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
if ( ! thread_spawn_failed ) {
|
||||
Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
|
||||
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
|
||||
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
|
||||
Impl::OpenMPExec::m_pool_topo[0] = thread_count ;
|
||||
Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
|
||||
Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
|
||||
|
||||
// New, unified host thread team data:
|
||||
{
|
||||
@ -334,7 +336,7 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
size_t team_shared_bytes = 1024 * thread_count ;
|
||||
size_t thread_local_bytes = 1024 ;
|
||||
|
||||
Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes
|
||||
@ -371,15 +373,15 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
|
||||
void OpenMP::finalize()
|
||||
{
|
||||
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
|
||||
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
|
||||
Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" );
|
||||
Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" );
|
||||
|
||||
// New, unified host thread team data:
|
||||
Impl::OpenMPexec::clear_thread_data();
|
||||
Impl::OpenMPExec::clear_thread_data();
|
||||
|
||||
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
|
||||
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
|
||||
Impl::OpenMPexec::m_pool_topo[2] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[0] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[1] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[2] = 0 ;
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
@ -396,7 +398,7 @@ void OpenMP::finalize()
|
||||
|
||||
void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
{
|
||||
Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
|
||||
Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" );
|
||||
|
||||
s << "Kokkos::OpenMP" ;
|
||||
|
||||
@ -414,12 +416,12 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
;
|
||||
#endif
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
|
||||
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
|
||||
|
||||
if ( is_initialized ) {
|
||||
const int numa_count = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
|
||||
const int core_per_numa = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
|
||||
const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
|
||||
const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ;
|
||||
const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
|
||||
const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
|
||||
|
||||
s << " thread_pool_topology[ " << numa_count
|
||||
<< " x " << core_per_numa
|
||||
@ -428,7 +430,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
<< std::endl ;
|
||||
|
||||
if ( detail ) {
|
||||
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
|
||||
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
@ -442,7 +444,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
|
||||
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
|
||||
s << " thread omp_rank[" << i << "]"
|
||||
<< " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
|
||||
<< " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]"
|
||||
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
|
||||
<< std::endl ;
|
||||
}
|
||||
@ -457,6 +459,11 @@ int OpenMP::concurrency() {
|
||||
return thread_pool_size(0);
|
||||
}
|
||||
|
||||
const char* OpenMP::name() { return "OpenMP"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_OPENMP_EXEC_PREVENT_LINK_ERROR() {}
|
||||
#endif //KOKKOS_ENABLE_OPENMP
|
||||
|
||||
@ -44,6 +44,9 @@
|
||||
#ifndef KOKKOS_OPENMPEXEC_HPP
|
||||
#define KOKKOS_OPENMPEXEC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
@ -66,7 +69,7 @@ namespace Impl {
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Data for OpenMP thread execution */
|
||||
|
||||
class OpenMPexec {
|
||||
class OpenMPExec {
|
||||
public:
|
||||
|
||||
friend class Kokkos::OpenMP ;
|
||||
@ -324,17 +327,21 @@ public:
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
inline
|
||||
bool OpenMP::in_parallel()
|
||||
{ return omp_in_parallel(); }
|
||||
|
||||
inline
|
||||
int OpenMP::thread_pool_size( int depth )
|
||||
{
|
||||
return Impl::OpenMPexec::pool_size(depth);
|
||||
return Impl::OpenMPExec::pool_size(depth);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int OpenMP::thread_pool_rank()
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
|
||||
return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ];
|
||||
#else
|
||||
return -1 ;
|
||||
#endif
|
||||
@ -342,4 +349,6 @@ int OpenMP::thread_pool_rank()
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,9 +44,12 @@
|
||||
#ifndef KOKKOS_OPENMP_PARALLEL_HPP
|
||||
#define KOKKOS_OPENMP_PARALLEL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <omp.h>
|
||||
#include <iostream>
|
||||
#include <OpenMP/Kokkos_OpenMPexec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -58,7 +61,7 @@ namespace Impl {
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::OpenMP
|
||||
, Kokkos::OpenMP
|
||||
>
|
||||
{
|
||||
private:
|
||||
@ -77,7 +80,7 @@ private:
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -94,7 +97,7 @@ private:
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -111,12 +114,12 @@ public:
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
@ -200,11 +203,6 @@ private:
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update );
|
||||
}
|
||||
@ -218,11 +216,6 @@ private:
|
||||
, reference_type update )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update );
|
||||
}
|
||||
@ -235,13 +228,13 @@ public:
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t pool_reduce_bytes =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
@ -249,7 +242,7 @@ public:
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
@ -282,12 +275,12 @@ public:
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
|
||||
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
|
||||
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
@ -305,7 +298,7 @@ public:
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ViewType & arg_result_view
|
||||
, const ViewType & arg_view
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
@ -313,7 +306,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
, m_result_ptr( arg_view.data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -327,7 +320,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -378,11 +371,6 @@ private:
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update , final );
|
||||
}
|
||||
@ -396,11 +384,6 @@ private:
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update , final );
|
||||
}
|
||||
@ -411,13 +394,13 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
|
||||
const int value_count = Analysis::value_count( m_functor );
|
||||
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
@ -425,7 +408,7 @@ public:
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
|
||||
|
||||
@ -565,22 +548,22 @@ public:
|
||||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const size_t pool_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
|
||||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_size
|
||||
OpenMPExec::resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
@ -723,8 +706,8 @@ public:
|
||||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t pool_reduce_size =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
@ -733,14 +716,14 @@ public:
|
||||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_size
|
||||
OpenMPExec::resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
@ -785,12 +768,12 @@ public:
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
|
||||
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
|
||||
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
@ -830,7 +813,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >
|
||||
@ -849,5 +832,6 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,15 +36,16 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
|
||||
@ -110,21 +111,27 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
|
||||
static task_root_type * const end =
|
||||
(task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
|
||||
HostThreadTeamData & team_data_single =
|
||||
HostThreadTeamDataSingleton::singleton();
|
||||
|
||||
const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
|
||||
// const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
|
||||
const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core
|
||||
// const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
OpenMPExec::resize_thread_data( 0 /* global reduce buffer */
|
||||
, 512 * team_size /* team reduce buffer */
|
||||
, 0 /* team shared buffer */
|
||||
, 0 /* thread local buffer */
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
|
||||
Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data();
|
||||
|
||||
// Organizing threads into a team performs a barrier across the
|
||||
// entire pool to insure proper initialization of the team
|
||||
@ -164,7 +171,7 @@ fflush(stdout);
|
||||
if ( 0 != task && end != task ) {
|
||||
// team member #0 completes the previously executed task,
|
||||
// completion may delete the task
|
||||
queue->complete( task );
|
||||
queue->complete( task );
|
||||
}
|
||||
|
||||
// If 0 == m_ready_count then set task = 0
|
||||
@ -301,7 +308,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
|
||||
(*task->m_apply)( task , & single_exec );
|
||||
|
||||
queue->complete( task );
|
||||
queue->complete( task );
|
||||
|
||||
} while(1);
|
||||
}
|
||||
@ -310,7 +317,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_OPENMP_KOKKOS_OPENMP_TASK_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,6 +44,7 @@
|
||||
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
#define KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
306
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
Normal file
306
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
Normal file
@ -0,0 +1,306 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <omp.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
|
||||
#include <Kokkos_OpenMPTargetSpace.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
/* Default allocation mechanism */
|
||||
OpenMPTargetSpace::OpenMPTargetSpace()
|
||||
{}
|
||||
|
||||
void * OpenMPTargetSpace::allocate( const size_t arg_alloc_size ) const
|
||||
{
|
||||
static_assert( sizeof(void*) == sizeof(uintptr_t)
|
||||
, "Error sizeof(void*) != sizeof(uintptr_t)" );
|
||||
|
||||
void * ptr;
|
||||
|
||||
ptr = omp_target_alloc( arg_alloc_size, omp_get_default_device());
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
||||
void OpenMPTargetSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
|
||||
{
|
||||
if ( arg_alloc_ptr ) {
|
||||
|
||||
omp_target_free( arg_alloc_ptr , omp_get_default_device() );
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
SharedAllocationRecord< void , void >
|
||||
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record ;
|
||||
|
||||
/* Destructor: hands the tracked allocation (pointer and size stored in the
 * base record) back to the memory space this record was created with.
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
~SharedAllocationRecord()
{
  void * const block_ptr  = SharedAllocationRecord< void , void >::m_alloc_ptr ;
  const size_t block_size = SharedAllocationRecord< void , void >::m_alloc_size ;

  m_space.deallocate( block_ptr , block_size );
}
|
||||
|
||||
//TODO: Implement deep copy back see CudaSpace
|
||||
std::string
|
||||
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_label() const {
|
||||
return std::string("OpenMPTargetAllocation");
|
||||
}
|
||||
|
||||
/* Create a new heap-allocated record in arg_space with the given label and
 * user-data size.  The caller owns the returned record.
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >*
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space,
          const std::string & arg_label ,
          const size_t arg_alloc_size)
{
  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > record_type ;

  record_type * const rec = new record_type( arg_space , arg_label , arg_alloc_size );

  return rec ;
}
|
||||
|
||||
void
|
||||
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
|
||||
deallocate( SharedAllocationRecord<void, void> * arg_rec )
|
||||
{
|
||||
delete static_cast<SharedAllocationRecord*>(arg_rec);
|
||||
}
|
||||
|
||||
/* Construct a record that owns a fresh allocation in arg_space.
 *
 * The allocation is sized for a SharedAllocationHeader followed by
 * arg_alloc_size bytes of user memory.  A header is filled in locally with
 * a back-pointer to this record and the (truncated, always NUL-terminated)
 * label.
 *
 * NOTE(review): the locally built header is never copied into the
 * allocation (see the TODO below), so the allocation's header bytes are
 * left as returned by the allocator — confirm before relying on them.
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                      , const std::string & arg_label
                      , const size_t arg_alloc_size
                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
                      )
  // Pass through allocated [ SharedAllocationHeader , user_memory ]
  // Pass through deallocation function
  : SharedAllocationRecord< void , void >
      ( & SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record
      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
      , sizeof(SharedAllocationHeader) + arg_alloc_size
      , arg_dealloc
      )
  , m_space( arg_space )
{
  SharedAllocationHeader header;

  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );

  // strncpy() does not terminate the destination when the source is at
  // least as long as the count, so copy one character less than the buffer
  // holds and terminate explicitly.
  strncpy( header.m_label
         , arg_label.c_str()
         , SharedAllocationHeader::maximum_label_length - 1
         );
  header.m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = '\0' ;

  //TODO DeepCopy
  // DeepCopy

}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/* Allocate a reference-counted block and return a pointer to its user data.
 *
 * A request for zero bytes returns a null pointer without allocating.
 * Otherwise a new record is created and its use count is incremented once.
 */
void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                , const std::string & arg_alloc_label
                , const size_t arg_alloc_size )
{
  if ( 0 == arg_alloc_size ) {
    return (void *) 0 ;
  }

  SharedAllocationRecord * const rec =
    allocate( arg_space , arg_alloc_label , arg_alloc_size );

  RecordBase::increment( rec );

  return rec->data();
}
|
||||
|
||||
/* Drop one reference on the record that owns arg_alloc_ptr.
 *
 * A null pointer is ignored.
 */
void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
  if ( 0 == arg_alloc_ptr ) return ;

  SharedAllocationRecord * const rec = get_record( arg_alloc_ptr );

  RecordBase::decrement( rec );
}
|
||||
|
||||
/* Replace a tracked allocation with a new one of arg_alloc_size bytes.
 *
 * A new record is allocated in the old record's space with the old record's
 * label; the new record gains a reference and the old one loses a reference.
 * Returns the user-data pointer of the new record.
 *
 * NOTE: the copy of the old contents into the new block is disabled
 * (see the commented-out DeepCopy), so the new block is not populated here.
 */
void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
                  , const size_t arg_alloc_size )
{
  SharedAllocationRecord * const old_rec = get_record( arg_alloc_ptr );
  SharedAllocationRecord * const new_rec =
    allocate( old_rec->m_space , old_rec->get_label() , arg_alloc_size );

  //Kokkos::Impl::DeepCopy<OpenMPTargetSpace,OpenMPTargetSpace>( r_new->data() , r_old->data()
  //                                           , std::min( r_old->size() , r_new->size() ) );

  RecordBase::increment( new_rec );
  RecordBase::decrement( old_rec );

  return new_rec->data();
}
|
||||
|
||||
/* Look up the record that owns the user-data pointer alloc_ptr.
 *
 * The header is located via SharedAllocationHeader::get_header() and its
 * m_record back-pointer is followed.  A runtime exception is thrown when
 * alloc_ptr is null, when the header carries no record, or when the record
 * does not point back at that header.
 *
 * NOTE(review): the header is dereferenced directly from this function —
 * confirm that target-device memory is readable from the caller's context.
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record( void * alloc_ptr )
{
  typedef SharedAllocationHeader  Header ;
  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >  RecordHost ;

  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;

  // 'record' is null both for a null alloc_ptr and for a header without a
  // back-pointer; testing it first avoids dereferencing a null record.
  if ( ! record || record->m_alloc_ptr != head ) {
    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record ERROR" ) );
  }

  return record ;
}
|
||||
|
||||
// Iterate records to print orphaned memory ...
|
||||
/* Print the records chained from this specialization's root record to s.
 *
 * Forwards to the base-class printer with the space name "OpenMPTargetSpace".
 * The 'space' argument is not consulted.
 */
void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
print_records( std::ostream & s , const Kokkos::Experimental::OpenMPTargetSpace & space , bool detail )
{
  typedef SharedAllocationRecord< void , void > base_record ;

  base_record::print_host_accessible_records( s , "OpenMPTargetSpace" , & s_root_record , detail );
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class >
|
||||
struct ViewOperatorBoundsErrorAbort ;
|
||||
|
||||
template<>
|
||||
struct ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace > {
|
||||
static void apply( const size_t rank
|
||||
, const size_t n0 , const size_t n1
|
||||
, const size_t n2 , const size_t n3
|
||||
, const size_t n4 , const size_t n5
|
||||
, const size_t n6 , const size_t n7
|
||||
, const size_t i0 , const size_t i1
|
||||
, const size_t i2 , const size_t i3
|
||||
, const size_t i4 , const size_t i5
|
||||
, const size_t i6 , const size_t i7 );
|
||||
};
|
||||
|
||||
void ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace >::
|
||||
apply( const size_t rank
|
||||
, const size_t n0 , const size_t n1
|
||||
, const size_t n2 , const size_t n3
|
||||
, const size_t n4 , const size_t n5
|
||||
, const size_t n6 , const size_t n7
|
||||
, const size_t i0 , const size_t i1
|
||||
, const size_t i2 , const size_t i3
|
||||
, const size_t i4 , const size_t i5
|
||||
, const size_t i6 , const size_t i7 )
|
||||
{
|
||||
printf( "View operator bounds error : rank(%lu) dim(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu) index(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu)"
|
||||
, rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7
|
||||
, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
|
||||
//Kokkos::Impl::throw_runtime_exception( buffer );
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*
|
||||
namespace Kokkos {
|
||||
namespace {
|
||||
const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
|
||||
const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
|
||||
static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
|
||||
}
|
||||
|
||||
namespace Impl {
|
||||
void init_lock_array_host_space() {
|
||||
static int is_initialized = 0;
|
||||
if(! is_initialized)
|
||||
for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
|
||||
HOST_SPACE_ATOMIC_LOCKS[i] = 0;
|
||||
}
|
||||
|
||||
bool lock_address_host_space(void* ptr) {
|
||||
return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
|
||||
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
|
||||
0 , 1);
|
||||
}
|
||||
|
||||
void unlock_address_host_space(void* ptr) {
|
||||
atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
|
||||
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
|
||||
0);
|
||||
}
|
||||
|
||||
}
|
||||
}*/
|
||||
273
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Normal file
273
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Normal file
@ -0,0 +1,273 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <iostream>
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int kokkos_omp_in_parallel();
|
||||
|
||||
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
|
||||
|
||||
/* Report whether the caller is inside an OpenMP parallel region and not
 * inside a region flagged by kokkos_omp_in_critical_region.
 * Always returns 0 when compiled for a CUDA device (__CUDA_ARCH__ defined).
 */
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel()
{
#if defined( __CUDA_ARCH__ )
  return 0;
#else
  const bool in_parallel = ( omp_in_parallel() != 0 );

  return ( in_parallel && ( 0 == kokkos_omp_in_critical_region ) ) ? 1 : 0 ;
#endif
}
|
||||
|
||||
bool s_using_hwloc = false;
|
||||
|
||||
} // namespace
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
bool OpenMPTarget::m_is_initialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
|
||||
//int OpenMPTargetExec::m_map_rank[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
//int OpenMPTargetExec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
//OpenMPTargetExec * OpenMPTargetExec::m_pool[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
void OpenMPTargetExec::verify_is_process( const char * const label )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: in parallel" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
}
|
||||
|
||||
void OpenMPTargetExec::verify_initialized( const char * const label )
|
||||
{
|
||||
if ( 0 == Kokkos::Experimental::OpenMPTarget::is_initialized() ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: not initialized" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
if ( omp_get_max_threads() != Kokkos::Experimental::OpenMPTarget::thread_pool_size(0) ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: Initialized but threads modified inappropriately" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void* OpenMPTargetExec::m_scratch_ptr = NULL;
|
||||
int64_t OpenMPTargetExec::m_scratch_size = 0;
|
||||
|
||||
void OpenMPTargetExec::clear_scratch()
|
||||
{
|
||||
Kokkos::Experimental::OpenMPTargetSpace space;
|
||||
space.deallocate(m_scratch_ptr,m_scratch_size);
|
||||
m_scratch_ptr = NULL;
|
||||
m_scratch_size = NULL;
|
||||
}
|
||||
|
||||
/* Return the current scratch buffer pointer (null when none is allocated). */
void* OpenMPTargetExec::get_scratch_ptr()
{
  return m_scratch_ptr;
}
|
||||
|
||||
void OpenMPTargetExec::resize_scratch( int64_t reduce_bytes ,
|
||||
int64_t team_reduce_bytes,
|
||||
int64_t team_shared_bytes, int64_t thread_local_bytes)
|
||||
{
|
||||
Kokkos::Experimental::OpenMPTargetSpace space;
|
||||
uint64_t total_size = MAX_ACTIVE_TEAMS * reduce_bytes + // Inter Team Reduction
|
||||
MAX_ACTIVE_TEAMS * team_reduce_bytes + // Intra Team Reduction
|
||||
MAX_ACTIVE_TEAMS * team_shared_bytes + // Team Local Scratch
|
||||
MAX_ACTIVE_THREADS * thread_local_bytes; // Thread Private Scratch
|
||||
|
||||
if( total_size > m_scratch_size ) {
|
||||
space.deallocate(m_scratch_ptr,m_scratch_size);
|
||||
m_scratch_size = total_size;
|
||||
m_scratch_ptr = space.allocate(total_size);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
int OpenMPTarget::is_initialized()
|
||||
{ return m_is_initialized; }// != Impl::OpenMPTargetExec::m_pool[0]; }
|
||||
|
||||
void OpenMPTarget::initialize( unsigned thread_count ,
|
||||
unsigned use_numa_count ,
|
||||
unsigned use_cores_per_numa )
|
||||
{
|
||||
// Before any other call to OMP query the maximum number of threads
|
||||
// and save the value for re-initialization unit testing.
|
||||
|
||||
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Kokkos::Impl::init_lock_array_host_space();
|
||||
|
||||
#ifdef KOKKOS_ENABLE_PROFILING
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
m_is_initialized = true;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void OpenMPTarget::finalize()
|
||||
{
|
||||
Kokkos::Impl::OpenMPTargetExec::verify_initialized( "OpenMPTarget::finalize" );
|
||||
Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::finalize" );
|
||||
|
||||
m_is_initialized = false;
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
if ( Kokkos::Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
|
||||
hwloc::unbind_this_thread();
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_PROFILING
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void OpenMPTarget::print_configuration( std::ostream & s , const bool detail )
|
||||
{
|
||||
Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::print_configuration" );
|
||||
/*
|
||||
s << "Kokkos::Experimental::OpenMPTarget" ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
s << " KOKKOS_ENABLE_OPENMPTARGET" ;
|
||||
#endif
|
||||
#if defined( KOKKOS_HAVE_HWLOC )
|
||||
|
||||
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
|
||||
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
|
||||
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
|
||||
|
||||
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
|
||||
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
|
||||
;
|
||||
#endif
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPTargetExec::m_pool[0] ;
|
||||
|
||||
if ( is_initialized ) {
|
||||
const int numa_count = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] ;
|
||||
const int core_per_numa = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
|
||||
const int thread_per_core = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
|
||||
|
||||
s << " thread_pool_topology[ " << numa_count
|
||||
<< " x " << core_per_numa
|
||||
<< " x " << thread_per_core
|
||||
<< " ]"
|
||||
<< std::endl ;
|
||||
|
||||
if ( detail ) {
|
||||
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp critical
|
||||
{
|
||||
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
|
||||
}
|
||||
// END #pragma omp critical
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
|
||||
s << " thread omp_rank[" << i << "]"
|
||||
<< " kokkos_rank[" << Impl::OpenMPTargetExec::m_map_rank[ i ] << "]"
|
||||
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
s << " not initialized" << std::endl ;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
int OpenMPTarget::concurrency() {
|
||||
return thread_pool_size(0);
|
||||
}
|
||||
|
||||
/* Return the human-readable name of this execution space. */
const char* OpenMPTarget::name()
{
  return "OpenMPTarget";
}
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif //KOKKOS_ENABLE_OPENMPTARGET
|
||||
727
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
Normal file
727
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
Normal file
@ -0,0 +1,727 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
|
||||
#define KOKKOS_OPENMPTARGETEXEC_HPP
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Data for OpenMPTarget thread execution */
|
||||
|
||||
|
||||
/* Static bookkeeping for OpenMPTarget execution: state verification helpers
 * and a single scratch buffer shared by all users of the class.
 */
class OpenMPTargetExec {
public:
  // Multipliers applied to the per-thread and per-team byte counts when
  // resize_scratch() computes the scratch buffer size.
  enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
  enum { MAX_ACTIVE_TEAMS   = MAX_ACTIVE_THREADS/32 };

private:
  static void* scratch_ptr;

public:
  // Throw if called from inside a parallel region; 'label' prefixes the message.
  static void verify_is_process( const char * const label );

  // Throw if the execution space is not initialized; 'label' prefixes the message.
  static void verify_initialized( const char * const label );

  // Scratch buffer management.
  static void* get_scratch_ptr();
  static void  clear_scratch();
  static void  resize_scratch( int64_t reduce_bytes
                             , int64_t team_reduce_bytes
                             , int64_t team_shared_bytes
                             , int64_t thread_local_bytes );

  // Current scratch buffer and its size in bytes.
  static void*   m_scratch_ptr;
  static int64_t m_scratch_size;
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
class OpenMPTargetExecTeamMember {
|
||||
public:
|
||||
|
||||
enum { TEAM_REDUCE_SIZE = 512 };
|
||||
|
||||
/** \brief Thread states for team synchronization */
|
||||
enum { Active = 0 , Rendezvous = 1 };
|
||||
|
||||
typedef Kokkos::Experimental::OpenMPTarget execution_space ;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space ;
|
||||
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_scratch_size[2] ;
|
||||
int m_team_rank ;
|
||||
int m_team_size ;
|
||||
int m_league_rank ;
|
||||
int m_league_size ;
|
||||
int m_vector_length ;
|
||||
int m_vector_lane ;
|
||||
void* m_glb_scratch ;
|
||||
|
||||
/*
|
||||
// Fan-in team threads, root of the fan-in which does not block returns true
|
||||
inline
|
||||
bool team_fan_in() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
|
||||
}
|
||||
|
||||
if ( m_team_rank_rev ) {
|
||||
m_exec.state_set( Rendezvous );
|
||||
memory_fence();
|
||||
m_exec.state_wait( Rendezvous );
|
||||
}
|
||||
|
||||
return 0 == m_team_rank_rev ;
|
||||
}
|
||||
|
||||
inline
|
||||
void team_fan_out() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
|
||||
memory_fence();
|
||||
}
|
||||
}
|
||||
*/
|
||||
public:
|
||||
|
||||
/* Team scratch handle; selects team/thread mode (0,1,0) on m_team_shared. */
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_shmem() const
{
  return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
}
|
||||
|
||||
/* Team scratch handle; the level argument is ignored and mode (0,1,0) is
 * always selected on m_team_shared.
 */
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_scratch(int) const
{
  return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
}
|
||||
|
||||
/* Per-thread scratch handle; the level argument is ignored and the mode is
 * selected from this member's team size and team rank.
 */
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& thread_scratch(int) const
{
  return m_team_shared.set_team_thread_mode( 0 , team_size() , team_rank() );
}
|
||||
|
||||
/* Rank of this member's team within the league. */
KOKKOS_INLINE_FUNCTION
int league_rank() const
{
  return m_league_rank ;
}
|
||||
/* Number of teams in the league. */
KOKKOS_INLINE_FUNCTION
int league_size() const
{
  return m_league_size ;
}
|
||||
/* Rank of this member within its team. */
KOKKOS_INLINE_FUNCTION
int team_rank() const
{
  return m_team_rank ;
}
|
||||
/* Number of members in this team. */
KOKKOS_INLINE_FUNCTION
int team_size() const
{
  return m_team_size ;
}
|
||||
|
||||
/* Synchronize the team by executing an OpenMP barrier. */
KOKKOS_INLINE_FUNCTION
void team_barrier() const
{
  #pragma omp barrier
}
|
||||
|
||||
template<class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast(ValueType& value, const int& thread_id) const
|
||||
{
|
||||
/*#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ }
|
||||
#else
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
|
||||
, ValueType , void >::type type ;
|
||||
|
||||
type * const local_value = ((type*) m_exec.scratch_thread());
|
||||
if(team_rank() == thread_id)
|
||||
*local_value = value;
|
||||
memory_fence();
|
||||
team_barrier();
|
||||
value = *local_value;
|
||||
#endif*/
|
||||
}
|
||||
|
||||
template< class ValueType, class JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION ValueType
|
||||
team_reduce( const ValueType & value
|
||||
, const JoinOp & op_in ) const {
|
||||
|
||||
#pragma omp barrier
|
||||
|
||||
typedef ValueType value_type;
|
||||
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
|
||||
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
|
||||
, value_type , void >::type type ;
|
||||
|
||||
const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
|
||||
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
|
||||
for(int i = m_team_rank; i < n_values; i+= m_team_size) {
|
||||
team_scratch[i] = value_type();
|
||||
}
|
||||
|
||||
#pragma omp barrier
|
||||
|
||||
for(int k=0; k<m_team_size; k+=n_values) {
|
||||
if((k <= m_team_rank) && (k+n_values > m_team_rank))
|
||||
team_scratch[m_team_rank%n_values]+=value;
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
for(int d = 1; d<n_values;d*=2) {
|
||||
if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
|
||||
team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
return team_scratch[0];
|
||||
}
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
* with intra-team non-deterministic ordering accumulation.
|
||||
*
|
||||
* The global inter-team accumulation value will, at the end of the
|
||||
* league's parallel execution, be the scan's total.
|
||||
* Parallel execution ordering of the league's teams is non-deterministic.
|
||||
* As such the base value for each team's scan operation is similarly
|
||||
* non-deterministic.
|
||||
*/
|
||||
template< typename ArgType >
|
||||
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
|
||||
{
|
||||
/* // Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
|
||||
|
||||
volatile type * const work_value = ((type*) m_exec.scratch_thread());
|
||||
|
||||
*work_value = value ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
if ( team_fan_in() ) {
|
||||
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
|
||||
// m_team_base[0] == highest ranking team member
|
||||
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
|
||||
//
|
||||
// 1) copy from lower to higher rank, initialize lowest rank to zero
|
||||
// 2) prefix sum from lowest to highest rank, skipping lowest rank
|
||||
|
||||
type accum = 0 ;
|
||||
|
||||
if ( global_accum ) {
|
||||
for ( int i = m_team_size ; i-- ; ) {
|
||||
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
|
||||
accum += val ;
|
||||
}
|
||||
accum = atomic_fetch_add( global_accum , accum );
|
||||
}
|
||||
|
||||
for ( int i = m_team_size ; i-- ; ) {
|
||||
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
|
||||
const type offset = accum ;
|
||||
accum += val ;
|
||||
val = offset ;
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
team_fan_out();
|
||||
|
||||
return *work_value ;*/
|
||||
return ArgType();
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
*
|
||||
* The highest rank thread can compute the reduction total as
|
||||
* reduction_total = dev.team_scan( value ) + value ;
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
|
||||
{ return this-> template team_scan<Type>( value , 0 ); }
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver
|
||||
|
||||
private:
|
||||
|
||||
typedef execution_space::scratch_memory_space space ;
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
OpenMPTargetExecTeamMember( const int league_rank, const int league_size, const int team_size, const int vector_length //const TeamPolicyInternal< OpenMPTarget, Properties ...> & team
|
||||
, void* const glb_scratch
|
||||
, const int shmem_size_L1
|
||||
, const int shmem_size_L2
|
||||
)
|
||||
: m_team_shared(0,0)
|
||||
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
|
||||
, m_team_rank(0)
|
||||
, m_vector_length( vector_length )
|
||||
, m_team_size( team_size )
|
||||
, m_league_rank( league_rank )
|
||||
, m_league_size( league_size )
|
||||
, m_glb_scratch( glb_scratch )
|
||||
{
|
||||
const int omp_tid = omp_get_thread_num();
|
||||
m_league_rank = league_rank;
|
||||
m_team_rank = omp_tid/m_vector_length;
|
||||
m_vector_lane = omp_tid%m_vector_length;
|
||||
}
|
||||
|
||||
//! Returns the compile-time constant TEAM_REDUCE_SIZE.
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
|
||||
};
|
||||
|
||||
|
||||
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... >: public PolicyTraits<Properties ...>
|
||||
{
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos execution policy
|
||||
typedef TeamPolicyInternal execution_policy ;
|
||||
|
||||
typedef PolicyTraits<Properties ... > traits;
|
||||
|
||||
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
|
||||
m_league_size = p.m_league_size;
|
||||
m_team_size = p.m_team_size;
|
||||
m_vector_length = p.m_vector_length;
|
||||
m_team_alloc = p.m_team_alloc;
|
||||
m_team_iter = p.m_team_iter;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & )
|
||||
{ return 1024; }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType & )
|
||||
{ return 256; }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType &, const int& vector_length)
|
||||
{ return 256/vector_length; }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
private:
|
||||
|
||||
int m_league_size ;
|
||||
int m_team_size ;
|
||||
int m_vector_length;
|
||||
int m_team_alloc ;
|
||||
int m_team_iter ;
|
||||
|
||||
size_t m_team_scratch_size[2];
|
||||
size_t m_thread_scratch_size[2];
|
||||
|
||||
int m_chunk_size;
|
||||
|
||||
inline void init( const int league_size_request
|
||||
, const int team_size_request
|
||||
, const int vector_length_request )
|
||||
{
|
||||
m_league_size = league_size_request ;
|
||||
|
||||
m_team_size = team_size_request;
|
||||
|
||||
m_vector_length = vector_length_request;
|
||||
|
||||
set_auto_chunk_size();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline int vector_length() const { return m_vector_length ; }
|
||||
inline int team_size() const { return m_team_size ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
|
||||
if(team_size_ < 0)
|
||||
team_size_ = m_team_size;
|
||||
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
|
||||
}
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, int team_size_request
|
||||
, int vector_length_request = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request , vector_length_request); }
|
||||
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int vector_length_request = 1)
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , 256/vector_length_request , vector_length_request ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, int team_size_request
|
||||
, int vector_length_request = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request , vector_length_request); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int vector_length_request = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , 256/vector_length_request , vector_length_request ); }
|
||||
|
||||
inline int team_alloc() const { return m_team_alloc ; }
|
||||
inline int team_iter() const { return m_team_iter ; }
|
||||
|
||||
inline int chunk_size() const { return m_chunk_size ; }
|
||||
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_chunk_size = chunk_size_;
|
||||
return p;
|
||||
}
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
private:
|
||||
/** \brief finalize chunk_size if it was set to AUTO*/
|
||||
inline void set_auto_chunk_size() {
|
||||
|
||||
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
|
||||
if( concurrency==0 ) concurrency=1;
|
||||
|
||||
if(m_chunk_size > 0) {
|
||||
if(!Impl::is_integral_power_of_two( m_chunk_size ))
|
||||
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
|
||||
}
|
||||
|
||||
int new_chunk_size = 1;
|
||||
while(new_chunk_size*100*concurrency < m_league_size)
|
||||
new_chunk_size *= 2;
|
||||
if(new_chunk_size < 128) {
|
||||
new_chunk_size = 1;
|
||||
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
|
||||
new_chunk_size*=2;
|
||||
}
|
||||
m_chunk_size = new_chunk_size;
|
||||
}
|
||||
|
||||
public:
|
||||
typedef Impl::OpenMPTargetExecTeamMember member_type ;
|
||||
};
|
||||
} // namespace Impl
|
||||
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
inline
|
||||
int OpenMPTarget::thread_pool_size( int depth )
|
||||
{
|
||||
//return Impl::OpenMPTargetExec::pool_size(depth);
|
||||
return omp_get_max_threads();
|
||||
}
|
||||
|
||||
/** \brief Rank of the calling thread as reported by omp_get_thread_num(). */
KOKKOS_INLINE_FUNCTION
int OpenMPTarget::thread_pool_rank()
{
  const int thread_rank = omp_get_thread_num();
  return thread_rank;
}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
|
||||
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief Wrap a team member in the tag type consumed by single(PerTeam(...), ...). */
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(const Impl::OpenMPTargetExecTeamMember& thread) {
  typedef Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> single_type;
  return single_type(thread);
}
|
||||
|
||||
/** \brief Wrap a team member in the tag type consumed by single(PerThread(...), ...). */
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(const Impl::OpenMPTargetExecTeamMember& thread) {
  typedef Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> single_type;
  return single_type(thread);
}
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, ValueType& result) {
|
||||
|
||||
result = ValueType();
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
|
||||
//result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
|
||||
//init_result = loop_boundaries.thread.team_reduce(result,join);
|
||||
}
|
||||
|
||||
} //namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
result = ValueType();
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
init_result = result;
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
|
||||
* for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
|
||||
* Depending on the target execution space the operator might be called twice: once with final=false
|
||||
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
|
||||
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
|
||||
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
|
||||
* to the final sum value over all vector lanes.
|
||||
* This functionality requires C++11 support.*/
|
||||
template< typename iType, class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const FunctorType & lambda) {
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
value_type scan_val = value_type();
|
||||
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,scan_val,true);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
|
||||
lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
|
||||
if(single_struct.team_member.team_rank()==0) lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
lambda(val);
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
if(single_struct.team_member.team_rank()==0) {
|
||||
lambda(val);
|
||||
}
|
||||
single_struct.team_member.team_broadcast(val,0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */
|
||||
|
||||
@ -0,0 +1,767 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP
|
||||
#define KOKKOS_OPENMPTARGET_PARALLEL_HPP
|
||||
|
||||
#include <omp.h>
|
||||
#include <iostream>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const {
|
||||
execute_impl<WorkTag>();
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename Policy::member_type begin = m_policy.begin();
|
||||
const typename Policy::member_type end = m_policy.end();
|
||||
|
||||
#pragma omp target teams distribute parallel for map(to:this->m_functor)
|
||||
for(int i=begin; i<end; i++)
|
||||
m_functor(i);
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename Policy::member_type begin = m_policy.begin();
|
||||
const typename Policy::member_type end = m_policy.end();
|
||||
|
||||
#pragma omp target teams distribute parallel for num_threads(128) map(to:this->m_functor)
|
||||
for(int i=begin; i<end; i++)
|
||||
m_functor(TagType(),i);
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, Policy arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/** \brief Fallback for (FunctorHasJoin, UseReducerType) combinations without
 *         a dedicated specialization.
 *
 *  execute() performs no reduction; it only prints an error message that
 *  contains the two selector values.
 */
template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType, int FunctorHasJoin, int UseReducerType>
struct ParallelReduceSpecialize {
  static inline void execute(const FunctorType& f, const PolicyType& p , PointerType result_ptr) {
    const int has_join    = FunctorHasJoin;
    const int use_reducer = UseReducerType;
    printf("Error: Invalid Specialization %i %i\n",has_join,use_reducer);
  }
};
|
||||
|
||||
template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
|
||||
struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
|
||||
typedef Kokkos::RangePolicy<PolicyArgs...> PolicyType;
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(TagType(),i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
inline static
|
||||
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
|
||||
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
|
||||
}
|
||||
};
|
||||
/*
|
||||
template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType>
|
||||
struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType, PointerType, ValueType, 0,1> {
|
||||
|
||||
#pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out, omp_in)) initializer ( ReducerType::init(omp_priv) )
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(custom: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(custom: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(TagType(),i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
inline static
|
||||
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
|
||||
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ...>
|
||||
, ReducerType
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
// Static Assert WorkTag void if ReducerType not InvalidType
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
|
||||
|
||||
enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
|
||||
enum {UseReducer = is_reducer_type<ReducerType>::value };
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
public:
|
||||
inline void execute() const {
|
||||
ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ViewType & arg_result_view
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelScan< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
/*
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update , final );
|
||||
}
|
||||
}
|
||||
*/
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
/* OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_scan");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_scan");
|
||||
|
||||
OpenMPTargetExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr =
|
||||
pointer_type( exec.scratch_reduce() ) +
|
||||
ValueTraits::value_count( m_functor );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( m_functor , ptr ) , false );
|
||||
}
|
||||
|
||||
{
|
||||
const unsigned thread_count = OpenMPTargetExec::pool_size();
|
||||
const unsigned value_count = ValueTraits::value_count( m_functor );
|
||||
|
||||
pointer_type ptr_prev = 0 ;
|
||||
|
||||
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
|
||||
|
||||
pointer_type ptr = pointer_type( OpenMPTargetExec::pool_rev(rank_rev)->scratch_reduce() );
|
||||
|
||||
if ( ptr_prev ) {
|
||||
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
|
||||
ValueJoin::join( m_functor , ptr + value_count , ptr );
|
||||
}
|
||||
else {
|
||||
ValueInit::init( m_functor , ptr );
|
||||
}
|
||||
|
||||
ptr_prev = ptr ;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueOps::reference( ptr ) , true );
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
|
||||
//----------------------------------------
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const {
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
execute_impl<WorkTag>();
|
||||
}
|
||||
|
||||
private:
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const int league_size = m_policy.league_size();
|
||||
const int team_size = m_policy.team_size();
|
||||
const int vector_length = m_policy.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
|
||||
#pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
|
||||
map(to:this->m_functor,scratch_ptr)
|
||||
for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
|
||||
typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
m_functor(team);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const int league_size = m_policy.league_size();
|
||||
const int team_size = m_policy.team_size();
|
||||
const int vector_length = m_policy.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
#pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
|
||||
map(to:this->m_functor,scratch_ptr)
|
||||
for(int i=0 ; i<league_size ; i++) {
|
||||
typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
m_functor(TagType(), team);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
};
|
||||
|
||||
template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
|
||||
struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
|
||||
typedef TeamPolicyInternal<PolicyArgs...> PolicyType;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
|
||||
const int league_size = p.league_size();
|
||||
const int team_size = p.team_size();
|
||||
const int vector_length = p.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
|
||||
map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
|
||||
for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
|
||||
typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
f(team,result);
|
||||
if(team.m_vector_lane!=0) result = 0;
|
||||
}
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
|
||||
const int league_size = p.league_size();
|
||||
const int team_size = p.team_size();
|
||||
const int vector_length = p.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
|
||||
map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
|
||||
for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
|
||||
typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
f(TagType(),team,result);
|
||||
if(team.vector_lane!=0) result = 0;
|
||||
}
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
inline static
|
||||
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
|
||||
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template< class FunctorType , class ReducerType, class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
, ReducerType
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
|
||||
enum {UseReducer = is_reducer_type<ReducerType>::value };
|
||||
|
||||
typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const {
|
||||
ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const ViewType & arg_result ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
|
||||
typedef iType index_type;
|
||||
const iType start;
|
||||
const iType end;
|
||||
const iType increment;
|
||||
|
||||
inline
|
||||
TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
|
||||
start( thread_.team_rank() ),
|
||||
end( count ),
|
||||
increment( thread_.team_size() )
|
||||
{}
|
||||
inline
|
||||
TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
|
||||
start( begin_+thread_.team_rank() ),
|
||||
end( end_ ),
|
||||
increment( thread_.team_size() )
|
||||
{}
|
||||
};
|
||||
|
||||
template<typename iType>
|
||||
struct ThreadVectorRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
|
||||
typedef iType index_type;
|
||||
const iType start;
|
||||
const iType end;
|
||||
const iType increment;
|
||||
|
||||
inline
|
||||
ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
|
||||
start( thread_.m_vector_lane ),
|
||||
end( count ),
|
||||
increment( thread_.m_vector_length )
|
||||
{}
|
||||
inline
|
||||
ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
|
||||
start( begin_+thread_.m_vector_lane ),
|
||||
end( end_ ),
|
||||
increment( thread_.m_vector_length )
|
||||
{}
|
||||
};
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
|
||||
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
|
||||
|
||||
329
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
Normal file
329
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
Normal file
@ -0,0 +1,329 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Default constructor: no executor pointers, all synchronization state zeroed,
// rank 0 within a team of size 1.
TaskExec< Kokkos::Experimental::OpenMPTarget >::TaskExec()
  : m_self_exec( 0 ) ,
    m_team_exec( 0 ) ,
    m_sync_mask( 0 ) ,
    m_sync_value( 0 ) ,
    m_sync_step( 0 ) ,
    m_group_rank( 0 ) ,
    m_team_rank( 0 ) ,
    m_team_size( 1 )
{}
|
||||
|
||||
// Team constructor: derives group rank and team rank from the executor's
// reversed pool rank, clears the first two int64 words of this thread's
// scratch_reduce() buffer, and builds the per-team barrier bit patterns.
TaskExec< Kokkos::Experimental::OpenMPTarget >::
TaskExec( Kokkos::Impl::OpenMPTargetExec & arg_exec , int const arg_team_size )
  : m_self_exec( & arg_exec ) ,
    m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ) ,
    m_sync_mask( 0 ) ,
    m_sync_value( 0 ) ,
    m_sync_step( 0 ) ,
    m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ) ,
    m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ) ,
    m_team_size( arg_team_size )
{
  // This team spans
  //    m_self_exec->pool_rev( team_size * group_rank )
  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )

  int64_t volatile * const sync_words = (int64_t *) m_self_exec->scratch_reduce();

  sync_words[0] = int64_t(0) ;
  sync_words[1] = int64_t(0) ;

  // One byte per team member: bit 0 of each byte goes into the expected
  // value, bits 0-1 of each byte go into the toggle mask.
  for ( int member = 0 ; member < m_team_size ; ++member ) {
    const int shift = 8 * member ;
    m_sync_value |= int64_t(1) << shift ;
    m_sync_mask  |= int64_t(3) << shift ;
  }

  Kokkos::memory_fence();
}
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
// Spin barrier across the members of one team.
//
// Each member writes its own byte of a shared 64-bit word taken from the team
// executor's scratch_reduce() buffer and then spins until the whole word
// equals m_sync_value. Two words are used alternately (selected by the low bit
// of m_sync_step) so consecutive barriers do not overtake one another.
void TaskExec< Kokkos::Experimental::OpenMPTarget >::team_barrier_impl() const
{
  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
    Kokkos::abort("TaskQueue<OpenMPTarget> scratch_reduce memory too small");
  }

  // Shared word used by this barrier step.
  int64_t volatile * const sync_word =
    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );

  // The single byte within that word owned by this team member.
  int8_t volatile * const my_byte =
    ((int8_t *) sync_word) + m_team_rank ;

#if 0
fprintf( stdout
       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
       , m_group_rank
       , m_team_rank
       , m_sync_step
       , m_sync_value
       , *sync_word
       );
fflush(stdout);
#endif

  // Signal arrival.
  *my_byte = int8_t( m_sync_value & 0x03 );

  // Wait for the rest of the team to arrive.
  while ( m_sync_value != *sync_word );

#if 0
fprintf( stdout
       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
       , m_group_rank
       , m_team_rank
       , m_sync_step
       , m_sync_value
       , *sync_word
       );
fflush(stdout);
#endif

  ++m_sync_step ;

  // Every other step: flip the expected value and wrap the step counter
  // once it has passed 1000.
  if ( ( m_sync_step & 0x01 ) == 0 ) {
    m_sync_value ^= m_sync_mask ;
    if ( 1000 < m_sync_step ) m_sync_step = 0 ;
  }
}
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Drain a task queue using the whole thread pool.
//
// Threads are grouped into teams of PoolExec::pool_size(2) members (at most 8).
// In each round the team lead pops a ready task, shares it with the team
// through the team's scratch_thread() slot, and the team then runs it either
// as a team task or as a single-thread task on the lead. A thread leaves the
// loop when its team observes a null task.
void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::execute
  ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
{
  using execution_space = Kokkos::Experimental::OpenMPTarget ;
  using queue_type      = TaskQueue< execution_space > ;
  using task_root_type  = TaskBase< execution_space , void , void > ;
  using PoolExec        = Kokkos::Impl::OpenMPTargetExec ;
  using Member          = TaskExec< execution_space > ;

  task_root_type * const end_tag = (task_root_type *) task_root_type::EndTag ;

  // Required: team_size <= 8

  const int team_size = PoolExec::pool_size(2); // Threads per core
  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA

  if ( 8 < team_size ) {
    Kokkos::abort("TaskQueue<OpenMPTarget> unsupported team size");
  }

#pragma omp parallel
  {
    PoolExec & pool_self = *PoolExec::get_thread_omp();

    Member serial_member ;
    Member team_member( pool_self , team_size );

    // Team shared memory
    task_root_type * volatile * const shared_slot =
      (task_root_type **) team_member.m_team_exec->scratch_thread();

// Barrier across entire OpenMPTarget thread pool to ensure initialization
#pragma omp barrier

    // Loop until all queues are empty and no tasks in flight

    for (;;) {

      task_root_type * task = 0 ;

      // Each team lead attempts to acquire either a thread team task
      // or a single thread task for the team.

      if ( 0 == team_member.team_rank() ) {

        if ( 0 < *((volatile int *) & queue->m_ready_count) ) {
          task = end_tag ;
        }

        // Loop by priority and then type
        for ( int prio = 0 ; prio < queue_type::NumQueue && end_tag == task ; ++prio ) {
          for ( int kind = 0 ; kind < 2 && end_tag == task ; ++kind ) {
            task = queue_type::pop_task( & queue->m_ready[prio][kind] );
          }
        }
      }

      // Team lead broadcast acquired task to team members:

      if ( 1 < team_member.team_size() ) {

        if ( 0 == team_member.team_rank() ) *shared_slot = task ;

        // Fence to be sure the shared slot is stored before the barrier
        Kokkos::memory_fence();

        // Whole team waits for every team member to reach this statement
        team_member.team_barrier();

        // Fence to be sure the shared slot is stored
        Kokkos::memory_fence();

        task = *shared_slot ;
      }

#if 0
fprintf( stdout
       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
       , team_member.m_group_rank
       , team_member.m_team_rank
       , uintptr_t(shared_slot)
       , uintptr_t(task)
       );
fflush(stdout);
#endif

      if ( 0 == task ) break ; // 0 == m_ready_count

      if ( end_tag == task ) {
        // All team members wait for whole team to reach this statement.
        // Is necessary to prevent the shared slot from being updated
        // before it is read by all threads.
        team_member.team_barrier();
        continue ;
      }

      if ( task_root_type::TaskTeam == task->m_task_type ) {
        // Thread Team Task
        (*task->m_apply)( task , & team_member );

        // The m_apply function performs a barrier

        if ( 0 == team_member.team_rank() ) {
          // team member #0 completes the task, which may delete the task
          queue->complete( task );
        }
      }
      else {
        // Single Thread Task

        if ( 0 == team_member.team_rank() ) {

          (*task->m_apply)( task , & serial_member );

          queue->complete( task );
        }

        // All team members wait for whole team to reach this statement.
        // Not necessary to complete the task.
        // Is necessary to prevent the shared slot from being updated
        // before it is read by all threads.
        team_member.team_barrier();
      }
    }
  }
// END #pragma omp parallel

}
|
||||
|
||||
// Drain the ready queues on the calling thread, but only when
// omp_get_num_threads() reports exactly one thread; otherwise do nothing.
void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::
  iff_single_thread_recursive_execute
    ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
{
  using execution_space = Kokkos::Experimental::OpenMPTarget ;
  using queue_type      = TaskQueue< execution_space > ;
  using task_root_type  = TaskBase< execution_space , void , void > ;
  using Member          = TaskExec< execution_space > ;

  if ( 1 != omp_get_num_threads() ) return ;

  task_root_type * const end_tag = (task_root_type *) task_root_type::EndTag ;

  Member serial_member ;

  for (;;) {

    task_root_type * task = end_tag ;

    // Loop by priority and then type
    for ( int prio = 0 ; prio < queue_type::NumQueue && end_tag == task ; ++prio ) {
      for ( int kind = 0 ; kind < 2 && end_tag == task ; ++kind ) {
        task = queue_type::pop_task( & queue->m_ready[prio][kind] );
      }
    }

    // Nothing was popped from any queue: done.
    if ( end_tag == task ) break ;

    (*task->m_apply)( task , & serial_member );

    queue->complete( task );
  }
}
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
||||
356
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
Normal file
356
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
Normal file
@ -0,0 +1,356 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
#define KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >
|
||||
{
|
||||
public:
|
||||
|
||||
using execution_space = Kokkos::Experimental::OpenMPTarget ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
|
||||
// Must specify memory space
|
||||
using memory_space = Kokkos::HostSpace ;
|
||||
|
||||
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const );
|
||||
|
||||
// Must provide task queue execution function
|
||||
static void execute( queue_type * const );
|
||||
|
||||
// Must provide mechanism to set function pointer in
|
||||
// execution space from the host process.
|
||||
template< typename FunctorType >
|
||||
static
|
||||
void proc_set_apply( task_base_type::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< Kokkos::Experimental::OpenMPTarget
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> ;
|
||||
*ptr = TaskType::apply ;
|
||||
}
|
||||
};
|
||||
|
||||
extern template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<>
|
||||
class TaskExec< Kokkos::Experimental::OpenMPTarget >
|
||||
{
|
||||
private:
|
||||
|
||||
TaskExec( TaskExec && ) = delete ;
|
||||
TaskExec( TaskExec const & ) = delete ;
|
||||
TaskExec & operator = ( TaskExec && ) = delete ;
|
||||
TaskExec & operator = ( TaskExec const & ) = delete ;
|
||||
|
||||
|
||||
using PoolExec = Kokkos::Impl::OpenMPTargetExec ;
|
||||
|
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget > ;
|
||||
|
||||
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
|
||||
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
|
||||
int64_t m_sync_mask ;
|
||||
int64_t mutable m_sync_value ;
|
||||
int mutable m_sync_step ;
|
||||
int m_group_rank ; ///< Which "team" subset of thread pool
|
||||
int m_team_rank ; ///< Which thread within a team
|
||||
int m_team_size ;
|
||||
|
||||
TaskExec();
|
||||
TaskExec( PoolExec & arg_exec , int arg_team_size );
|
||||
|
||||
void team_barrier_impl() const ;
|
||||
|
||||
public:
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
void * team_shared() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
|
||||
|
||||
int team_shared_size() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
|
||||
|
||||
/**\brief Whole team enters this function call
|
||||
* before any teeam member returns from
|
||||
* this function call.
|
||||
*/
|
||||
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
|
||||
#else
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
|
||||
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_rank() const { return m_team_rank ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_size() const { return m_team_size ; }
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >
|
||||
TeamThreadRange
|
||||
( Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >
|
||||
TeamThreadRange
|
||||
( Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
|
||||
, const iType & start
|
||||
, const iType & end )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >(thread,start,end);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.
|
||||
*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
)
|
||||
{
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename iType, class Lambda, typename ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
, ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
shared[0] += shared[i];
|
||||
}
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
join(shared[0], shared[i]);
|
||||
}
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
template< typename ValueType, typename iType, class Lambda >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, local_total;
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
int team_size = loop_boundaries.thread.team_size();
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
|
||||
// Intra-member scan
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
|
||||
shared[team_rank] = accum;
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// Member 0 do scan on accumulated totals
|
||||
if (team_rank == 0) {
|
||||
for( iType i = 1; i < team_size; i+=1) {
|
||||
shared[i] += shared[i-1];
|
||||
}
|
||||
accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// Inter-member scan adding in accumulated totals
|
||||
if (team_rank != 0) { accum = shared[team_rank-1]; }
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
|
||||
|
||||
@ -41,12 +41,13 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
@ -215,6 +216,8 @@ void Qthreads::fence()
|
||||
int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
|
||||
int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
|
||||
|
||||
const char* Qthreads::name() { return "Qthreads"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -516,4 +519,7 @@ QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMemb
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_SRC_QTHREADS_EXEC_PREVENT_LINK_ERROR() {}
|
||||
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
|
||||
@ -44,6 +44,9 @@
|
||||
#ifndef KOKKOS_QTHREADSEXEC_HPP
|
||||
#define KOKKOS_QTHREADSEXEC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -637,4 +640,6 @@ public:
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif // #define KOKKOS_QTHREADSEXEC_HPP
|
||||
|
||||
|
||||
@ -44,6 +44,9 @@
|
||||
#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
|
||||
#define KOKKOS_QTHREADS_PARALLEL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
@ -724,4 +727,6 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& sing
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */
|
||||
|
||||
|
||||
@ -41,10 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -314,7 +314,7 @@ void TaskQueueSpecialization< Kokkos::Qthreads >::
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#else
|
||||
void KOKKOS_SRC_QTHREADS_TASK_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
||||
|
||||
@ -44,7 +44,8 @@
|
||||
#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
|
||||
#define KOKKOS_IMPL_QTHREADS_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,20 +36,22 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
// Experimental unified task-data parallel manycore LDRD.
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdexcept>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
@ -448,7 +450,7 @@ TaskPolicy< Kokkos::Qthreads >::
|
||||
TaskPolicy
|
||||
( const unsigned /* arg_task_max_count */
|
||||
, const unsigned /* arg_task_max_size */
|
||||
, const unsigned arg_task_default_dependence_capacity
|
||||
, const unsigned arg_task_default_dependence_capacity
|
||||
, const unsigned arg_task_team_size
|
||||
)
|
||||
: m_default_dependence_capacity( arg_task_default_dependence_capacity )
|
||||
@ -484,5 +486,8 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_QTHREADS_KOKKOS_QTHREADS_TASKPOLICY_PREVENT_LINK_ERROR() {}
|
||||
#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -46,6 +46,9 @@
|
||||
#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
|
||||
#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <stdexcept>
|
||||
@ -69,7 +72,6 @@
|
||||
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -505,7 +507,7 @@ public:
|
||||
|
||||
template< class ValueType >
|
||||
const Future< ValueType , execution_space > &
|
||||
spawn( const Future< ValueType , execution_space > & f
|
||||
spawn( const Future< ValueType , execution_space > & f
|
||||
, const bool priority = false ) const
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
@ -632,7 +634,7 @@ public:
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
void respawn( FunctorType * task_functor
|
||||
void respawn( FunctorType * task_functor
|
||||
, const bool priority = false ) const
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
@ -41,7 +41,11 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#ifndef KOKKOS_QTHREADS_TASKQUEUE_HPP
|
||||
#define KOKKOS_QTHREADS_TASKQUEUE_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -60,7 +64,7 @@ private:
|
||||
using execution_space = Kokkos::Qthread ;
|
||||
using memory_space = Kokkos::HostSpace
|
||||
using device_type = Kokkos::Device< execution_space, memory_space > ;
|
||||
using memory_pool = Kokkos::Experimental::MemoryPool< device_type > ;
|
||||
using memory_pool = Kokkos::MemoryPool< device_type > ;
|
||||
using task_root_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
|
||||
|
||||
friend class Kokkos::TaskScheduler< execution_space > ;
|
||||
@ -317,3 +321,5 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif // KOKKOS_QTHREADS_TASKQUEUE_HPP
|
||||
|
||||
|
||||
@ -41,7 +41,11 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#ifndef KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
|
||||
#define KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
@ -433,4 +437,5 @@ fflush(stdout);
|
||||
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif // KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
|
||||
|
||||
|
||||
@ -41,11 +41,13 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
@ -512,8 +514,8 @@ void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
|
||||
|
||||
s << "Kokkos::Threads" ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
s << " KOKKOS_ENABLE_PTHREAD" ;
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
s << " KOKKOS_ENABLE_THREADS" ;
|
||||
#endif
|
||||
#if defined( KOKKOS_ENABLE_HWLOC )
|
||||
s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
|
||||
@ -817,10 +819,12 @@ int Threads::thread_pool_rank()
|
||||
}
|
||||
#endif
|
||||
|
||||
const char* Threads::name() { return "Threads"; }
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD ) */
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_THREADS_EXEC_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
|
||||
|
||||
|
||||
@ -44,7 +44,10 @@
|
||||
#ifndef KOKKOS_THREADSEXEC_HPP
|
||||
#define KOKKOS_THREADSEXEC_HPP
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
@ -626,6 +629,6 @@ inline void Threads::fence()
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_THREADSEXEC_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,18 +36,16 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
/* Standard 'C' Linux libraries */
|
||||
|
||||
#include <pthread.h>
|
||||
@ -154,6 +152,8 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value )
|
||||
|
||||
#elif defined( KOKKOS_ENABLE_WINTHREAD )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
/* Windows libraries */
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
@ -247,9 +247,7 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value ) {}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_THREADS_EXEC_BASE_PREVENT_LINK_ERROR() {}
|
||||
#endif /* end #elif defined( KOKKOS_ENABLE_WINTHREAD ) */
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
@ -44,7 +44,10 @@
|
||||
#ifndef KOKKOS_THREADSTEAM_HPP
|
||||
#define KOKKOS_THREADSTEAM_HPP
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
@ -173,7 +176,9 @@ public:
|
||||
}
|
||||
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< !Kokkos::is_reducer< Type >::value , Type>::type
|
||||
team_reduce( const Type & value ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return Type(); }
|
||||
#else
|
||||
@ -202,6 +207,55 @@ public:
|
||||
}
|
||||
#endif
|
||||
|
||||
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
team_reduce( const ReducerType & ) const
|
||||
{}
|
||||
#else
|
||||
team_reduce( const ReducerType & reducer ) const
|
||||
{
|
||||
typedef typename ReducerType::value_type value_type;
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
|
||||
, value_type , void >::type type ;
|
||||
|
||||
if ( 0 == m_exec ) return ;
|
||||
|
||||
type * const local_value = ((type*) m_exec->scratch_memory());
|
||||
|
||||
// Set this thread's contribution
|
||||
*local_value = reducer.reference() ;
|
||||
|
||||
// Fence to make sure the base team member has access:
|
||||
memory_fence();
|
||||
|
||||
if ( team_fan_in() ) {
|
||||
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
|
||||
type * const team_value = ((type*) m_team_base[0]->scratch_memory());
|
||||
|
||||
// Join to the team value:
|
||||
for ( int i = 1 ; i < m_team_size ; ++i ) {
|
||||
reducer.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
|
||||
}
|
||||
|
||||
// Team base thread may "lap" member threads so copy out to their local value.
|
||||
for ( int i = 1 ; i < m_team_size ; ++i ) {
|
||||
*((type*) m_team_base[i]->scratch_memory()) = *team_value ;
|
||||
}
|
||||
|
||||
// Fence to make sure all team members have access
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
team_fan_out();
|
||||
|
||||
// Value was changed by the team base
|
||||
reducer.reference() = *((type volatile const *) local_value);
|
||||
}
|
||||
#endif
|
||||
|
||||
template< class ValueType, class JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION ValueType
|
||||
team_reduce( const ValueType & value
|
||||
@ -760,7 +814,8 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thread
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
|
||||
typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
|
||||
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, ValueType& result) {
|
||||
|
||||
result = ValueType();
|
||||
@ -774,6 +829,21 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thr
|
||||
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, const ReducerType& reducer) {
|
||||
|
||||
reducer.init(reducer.reference());
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,reducer.reference());
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_reduce(reducer);
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
|
||||
@ -822,17 +892,26 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Thre
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
|
||||
typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
|
||||
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
result = ValueType();
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const ReducerType& reducer) {
|
||||
reducer.init(reducer.reference());
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,reducer.reference());
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
|
||||
@ -915,6 +994,6 @@ void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_THREADSTEAM_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,8 +44,11 @@
|
||||
#ifndef KOKKOS_THREADS_PARALLEL_HPP
|
||||
#define KOKKOS_THREADS_PARALLEL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <iostream>
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
|
||||
@ -83,7 +86,7 @@ private:
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -99,7 +102,7 @@ private:
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -308,7 +311,7 @@ private:
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update )
|
||||
{
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -325,7 +328,7 @@ private:
|
||||
, reference_type update )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -348,7 +351,7 @@ private:
|
||||
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( self.m_functor , range.begin() , range.end()
|
||||
( self.m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
|
||||
|
||||
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
|
||||
@ -428,7 +431,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -543,7 +546,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
@ -584,7 +587,7 @@ private:
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -601,7 +604,7 @@ private:
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -654,5 +657,6 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
|
||||
|
||||
|
||||
@ -44,24 +44,20 @@
|
||||
#ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP
|
||||
#define KOKKOS_HOST_EXP_ITERATE_TILE_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
|
||||
#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
|
||||
#define KOKKOS_MDRANGE_IVDEP
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef KOKKOS_MDRANGE_IVDEP
|
||||
#define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep")
|
||||
#else
|
||||
#define KOKKOS_ENABLE_IVDEP_MDRANGE
|
||||
#endif
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
|
||||
namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
|
||||
@ -243,7 +239,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
APPLY( func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_2( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -255,7 +251,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
LOOP_R_1( func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_3( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -267,7 +263,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
LOOP_R_2( func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_4( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -279,7 +275,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
LOOP_R_3( func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_5( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -291,7 +287,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
LOOP_R_4( func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_6( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -303,7 +299,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
LOOP_R_5( func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_7( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -315,7 +311,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
LOOP_R_6( func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_8( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -327,7 +323,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
LOOP_R_7( func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TILE_LOOP_1( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -459,7 +455,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
APPLY_REDUX( val, func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -471,7 +467,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
LOOP_R_1_REDUX( val, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -483,7 +479,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
LOOP_R_2_REDUX( val, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -495,7 +491,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
LOOP_R_3_REDUX( val, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -507,7 +503,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
LOOP_R_4_REDUX( val, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -519,7 +515,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
LOOP_R_5_REDUX( val, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -531,7 +527,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
LOOP_R_6_REDUX( val, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -543,7 +539,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
LOOP_R_7_REDUX( val, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TILE_LOOP_1_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -676,7 +672,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
TAGGED_APPLY( tag, func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -688,7 +684,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -700,7 +696,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -712,7 +708,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -724,7 +720,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -736,7 +732,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -748,7 +744,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -760,7 +756,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TAGGED_TILE_LOOP_1( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -892,7 +888,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -904,7 +900,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -916,7 +912,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -928,7 +924,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -940,7 +936,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -952,7 +948,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -964,7 +960,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -976,7 +972,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TAGGED_TILE_LOOP_1_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -1321,8 +1317,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
else {
|
||||
is_full_tile = false ;
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
|
||||
}
|
||||
}
|
||||
@ -1332,7 +1328,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
|
||||
template <int Rank>
|
||||
struct RankTag
|
||||
struct RankTag
|
||||
{
|
||||
typedef RankTag type;
|
||||
enum { value = (int)Rank };
|
||||
@ -1343,13 +1339,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1361,13 +1357,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#else
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
@ -1384,7 +1380,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1396,7 +1392,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
@ -1437,7 +1433,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1490,7 +1486,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1543,7 +1539,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1596,7 +1592,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1649,7 +1645,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1702,7 +1698,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1804,8 +1800,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
else {
|
||||
is_full_tile = false ;
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
|
||||
}
|
||||
}
|
||||
@ -1815,7 +1811,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
|
||||
template <int Rank>
|
||||
struct RankTag
|
||||
struct RankTag
|
||||
{
|
||||
typedef RankTag type;
|
||||
enum { value = (int)Rank };
|
||||
@ -1827,13 +1823,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1845,13 +1841,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#else
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
@ -1869,7 +1865,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1881,7 +1877,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
@ -1922,7 +1918,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1975,7 +1971,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2028,7 +2024,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2081,7 +2077,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2134,7 +2130,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2187,7 +2183,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2352,5 +2348,5 @@ struct MDFunctor< MDRange, Functor, void >
|
||||
|
||||
} } } //end namespace Kokkos::Experimental::Impl
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -195,3 +195,4 @@ struct PolicyTraits
|
||||
|
||||
|
||||
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,10 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_HPP )
|
||||
#define KOKKOS_ATOMIC_ASSEMBLY_HPP
|
||||
namespace Kokkos {
|
||||
@ -110,3 +112,4 @@ namespace Impl {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
|
||||
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_DECREMENT_HPP
|
||||
|
||||
@ -117,3 +118,4 @@ void atomic_decrement(volatile T* a) {
|
||||
|
||||
} // End of namespace Kokkos
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
|
||||
#define KOKKOS_ATOMIC_EXCHANGE_HPP
|
||||
|
||||
@ -356,5 +357,3 @@ void atomic_assign( volatile T * const dest , const T val )
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
|
||||
|
||||
@ -279,6 +280,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
{
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
|
||||
// Don't use the following line of code here:
|
||||
//
|
||||
//const T tmp = *dest = return_val + val;
|
||||
@ -292,6 +294,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
const T tmp = *dest;
|
||||
(void) tmp;
|
||||
Impl::unlock_address_host_space( (void*) dest );
|
||||
|
||||
return return_val;
|
||||
}
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,11 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_AND_HPP
|
||||
|
||||
@ -124,4 +125,3 @@ void atomic_and(volatile T * const dest, const T src) {
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,11 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_OR_HPP
|
||||
|
||||
@ -124,4 +125,3 @@ void atomic_or(volatile T * const dest, const T src) {
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,11 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
|
||||
|
||||
@ -238,4 +239,3 @@ void atomic_sub(volatile T * const dest, const T src) {
|
||||
#include<impl/Kokkos_Atomic_Assembly.hpp>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,10 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
|
||||
#define KOKKOS_ATOMIC_GENERIC_HPP
|
||||
#include <Kokkos_Macros.hpp>
|
||||
@ -424,6 +426,6 @@ T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
|
||||
return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
} // namespace Kokkos
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_INCREMENT_HPP
|
||||
|
||||
@ -115,3 +116,4 @@ void atomic_increment(volatile T* a) {
|
||||
|
||||
} // End of namespace Kokkos
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -428,3 +428,4 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,12 +36,13 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
|
||||
#define KOKKOS_ATOMIC_WINDOWS_HPP
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
#define NOMINMAX
|
||||
|
||||
@ -45,12 +45,38 @@
|
||||
#define KOKKOS_BITOPS_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <stdint.h>
|
||||
#include <cstdint>
|
||||
#include <climits>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/**\brief Find first zero bit.
|
||||
*
|
||||
* If none then return -1 ;
|
||||
*/
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
int bit_first_zero( unsigned i ) noexcept
|
||||
{
|
||||
enum : unsigned { full = ~0u };
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return full != i ? __ffs( ~i ) - 1 : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
return full != i ? _bit_scan_forward( ~i ) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return full != i ? __cnttz4( ~i ) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
|
||||
#else
|
||||
int offset = -1 ;
|
||||
if ( full != i ) {
|
||||
for ( offset = 0 ; i & ( 1 << offset ) ; ++offset );
|
||||
}
|
||||
return offset ;
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
int bit_scan_forward( unsigned i )
|
||||
{
|
||||
@ -121,7 +147,16 @@ int bit_count( unsigned i )
|
||||
#endif
|
||||
}
|
||||
|
||||
/**\brief  Exponent of a power of two that is not less than N.
 *
 *  Takes the index reported by bit_scan_reverse( N ) and bumps it by one
 *  when two-to-that-index is still smaller than N.
 *  NOTE(review): presumably bit_scan_reverse yields the index of the
 *  highest set bit; its definition is not visible here - confirm.
 */
KOKKOS_INLINE_FUNCTION
unsigned integral_power_of_two_that_contains( const unsigned N )
{
  const unsigned lg2 = Kokkos::Impl::bit_scan_reverse( N );

  // 2^lg2 already covers N: the exponent is sufficient as is.
  if ( ! ( (1u << lg2) < N ) ) { return lg2 ; }

  // Otherwise round up to the next exponent.
  return lg2 + 1 ;
}
|
||||
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_BITOPS_HPP
|
||||
|
||||
|
||||
@ -122,3 +122,4 @@ int mpi_local_rank_on_node() {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
106
lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
Normal file
106
lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
Normal file
@ -0,0 +1,106 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CLOCKTIC_HPP
|
||||
#define KOKKOS_CLOCKTIC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <stdint.h>
|
||||
#include <chrono>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/**\brief Quick query of clock register tics
|
||||
*
|
||||
* Primary use case is to, with low overhead,
|
||||
* obtain a integral value that consistently varies
|
||||
* across concurrent threads of execution within
|
||||
* a parallel algorithm.
|
||||
* This value is often used to "randomly" seed an
|
||||
* attempt to acquire an indexed resource (e.g., bit)
|
||||
* from an array of resources (e.g., bitset) such that
|
||||
* concurrent threads will have high likelihood of
|
||||
* having different index-seed values.
|
||||
*/
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
uint64_t clock_tic(void) noexcept
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
|
||||
// Return value of 64-bit hi-res clock register.
|
||||
|
||||
return clock64();
|
||||
|
||||
#elif defined( __i386__ ) || defined( __x86_64 )
|
||||
|
||||
// Return value of 64-bit hi-res clock register.
|
||||
|
||||
unsigned a = 0, d = 0;
|
||||
|
||||
__asm__ volatile( "rdtsc" : "=a" (a), "=d" (d) );
|
||||
|
||||
return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 );
|
||||
|
||||
#elif defined( __powerpc ) || defined( __powerpc__ ) || \
|
||||
defined( __powerpc64__ ) || defined( __POWERPC__ ) || \
|
||||
defined( __ppc__ ) || defined( __ppc64__ )
|
||||
|
||||
unsigned int cycles = 0;
|
||||
|
||||
asm volatile( "mftb %0" : "=r" (cycles) );
|
||||
|
||||
return (uint64_t) cycles;
|
||||
|
||||
#else
|
||||
|
||||
return (uint64_t)
|
||||
std::chrono::high_resolution_clock::now().time_since_epoch().count();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_CLOCKTIC_HPP
|
||||
357
lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
Normal file
357
lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp
Normal file
@ -0,0 +1,357 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CONCURRENTBITSET_HPP
|
||||
#define KOKKOS_CONCURRENTBITSET_HPP
|
||||
|
||||
#include <stdint.h>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <impl/Kokkos_BitOps.hpp>
|
||||
#include <impl/Kokkos_ClockTic.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
struct concurrent_bitset {
|
||||
public:
|
||||
|
||||
// 32 bits per integer value
|
||||
|
||||
enum : uint32_t { bits_per_int_lg2 = 5 };
|
||||
enum : uint32_t { bits_per_int_mask = ( 1 << bits_per_int_lg2 ) - 1 };
|
||||
|
||||
// Buffer is uint32_t[ buffer_bound ]
|
||||
// [ uint32_t { state_header | used_count } , uint32_t bits[*] ]
|
||||
//
|
||||
// Maximum bit count is 33 million (1u<<25):
|
||||
//
|
||||
// - Maximum bit set size occupies 1 Mbyte
|
||||
//
|
||||
// - State header can occupy bits [30-26]
|
||||
// which can be the bit_count_lg2
|
||||
//
|
||||
// - Accept at least 33 million concurrent calls to 'acquire'
|
||||
// before risking an overflow race condition on a full bitset.
|
||||
|
||||
enum : uint32_t { max_bit_count_lg2 = 25 };
|
||||
enum : uint32_t { max_bit_count = 1u << max_bit_count_lg2 };
|
||||
enum : uint32_t { state_shift = 26 };
|
||||
enum : uint32_t { state_used_mask = ( 1 << state_shift ) - 1 };
|
||||
enum : uint32_t { state_header_mask = uint32_t(0x001f) << state_shift };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static constexpr
|
||||
uint32_t buffer_bound_lg2( uint32_t const bit_bound_lg2 ) noexcept
|
||||
{
|
||||
return bit_bound_lg2 <= max_bit_count_lg2
|
||||
? 1 + ( 1u << ( bit_bound_lg2 > bits_per_int_lg2
|
||||
? bit_bound_lg2 - bits_per_int_lg2 : 0 ) )
|
||||
: 0 ;
|
||||
}
|
||||
|
||||
/**\brief Initialize bitset buffer */
|
||||
KOKKOS_INLINE_FUNCTION static constexpr
|
||||
uint32_t buffer_bound( uint32_t const bit_bound ) noexcept
|
||||
{
|
||||
return bit_bound <= max_bit_count
|
||||
? 1 + ( bit_bound >> bits_per_int_lg2 ) +
|
||||
( bit_bound & bits_per_int_mask ? 1 : 0 )
|
||||
: 0 ;
|
||||
}
|
||||
|
||||
/**\brief Claim any bit within the bitset bound.
|
||||
*
|
||||
* Return : ( which_bit , bit_count )
|
||||
*
|
||||
* if success then
|
||||
* bit_count is the atomic-count of claimed > 0
|
||||
* which_bit is the claimed bit >= 0
|
||||
* else if attempt failed due to filled buffer
|
||||
* bit_count == which_bit == -1
|
||||
* else if attempt failed due to non-matching state_header
|
||||
* bit_count == which_bit == -2
|
||||
* else if attempt failed due to max_bit_count_lg2 < bit_bound_lg2
|
||||
* or invalid state_header
|
||||
* or (1u << bit_bound_lg2) <= bit
|
||||
* bit_count == which_bit == -3
|
||||
* endif
|
||||
*
|
||||
* Recommended to have hint
|
||||
* bit = Kokkos::Impl::clock_tic() & ((1u<<bit_bound_lg2) - 1)
|
||||
*/
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
Kokkos::pair<int,int>
|
||||
acquire_bounded_lg2( uint32_t volatile * const buffer
|
||||
, uint32_t const bit_bound_lg2
|
||||
, uint32_t bit = 0 /* optional hint */
|
||||
, uint32_t const state_header = 0 /* optional header */
|
||||
) noexcept
|
||||
{
|
||||
typedef Kokkos::pair<int,int> type ;
|
||||
|
||||
const uint32_t bit_bound = 1 << bit_bound_lg2 ;
|
||||
const uint32_t word_count = bit_bound >> bits_per_int_lg2 ;
|
||||
|
||||
if ( ( max_bit_count_lg2 < bit_bound_lg2 ) ||
|
||||
( state_header & ~state_header_mask ) ||
|
||||
( bit_bound < bit ) ) {
|
||||
return type(-3,-3);
|
||||
}
|
||||
|
||||
// Use potentially two fetch_add to avoid CAS loop.
|
||||
// Could generate "racing" failure-to-acquire
|
||||
// when is full at the atomic_fetch_add(+1)
|
||||
// then a release occurs before the atomic_fetch_add(-1).
|
||||
|
||||
const uint32_t state = (uint32_t)
|
||||
Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );
|
||||
|
||||
const uint32_t state_error =
|
||||
state_header != ( state & state_header_mask );
|
||||
|
||||
const uint32_t state_bit_used = state & state_used_mask ;
|
||||
|
||||
if ( state_error || ( bit_bound <= state_bit_used ) ) {
|
||||
Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
|
||||
return state_error ? type(-2,-2) : type(-1,-1);
|
||||
}
|
||||
|
||||
// Do not update bit until count is visible:
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
// There is a zero bit available somewhere,
|
||||
// now find the (first) available bit and set it.
|
||||
|
||||
while(1) {
|
||||
|
||||
const uint32_t word = bit >> bits_per_int_lg2 ;
|
||||
const uint32_t mask = 1u << ( bit & bits_per_int_mask );
|
||||
const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);
|
||||
|
||||
if ( ! ( prev & mask ) ) {
|
||||
// Successfully claimed 'result.first' by
|
||||
// atomically setting that bit.
|
||||
return type( bit , state_bit_used + 1 );
|
||||
}
|
||||
|
||||
// Failed race to set the selected bit
|
||||
// Find a new bit to try.
|
||||
|
||||
const int j = Kokkos::Impl::bit_first_zero( prev );
|
||||
|
||||
if ( 0 <= j ) {
|
||||
bit = ( word << bits_per_int_lg2 ) | uint32_t(j);
|
||||
}
|
||||
else {
|
||||
bit =
|
||||
( (word+1) < word_count ? ((word+1) << bits_per_int_lg2) : 0 )
|
||||
| ( bit & bits_per_int_mask );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**\brief Claim any bit within the bitset bound.
|
||||
*
|
||||
* Return : ( which_bit , bit_count )
|
||||
*
|
||||
* if success then
|
||||
* bit_count is the atomic-count of claimed > 0
|
||||
* which_bit is the claimed bit >= 0
|
||||
* else if attempt failed due to filled buffer
|
||||
* bit_count == which_bit == -1
|
||||
* else if attempt failed due to non-matching state_header
|
||||
* bit_count == which_bit == -2
|
||||
* else if attempt failed due to max_bit_count_lg2 < bit_bound_lg2
|
||||
* or invalid state_header
|
||||
* or bit_bound <= bit
|
||||
* bit_count == which_bit == -3
|
||||
* endif
|
||||
*
|
||||
* Recommended to have hint
|
||||
* bit = Kokkos::Impl::clock_tic() % bit_bound
|
||||
*/
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
Kokkos::pair<int,int>
|
||||
acquire_bounded( uint32_t volatile * const buffer
|
||||
, uint32_t const bit_bound
|
||||
, uint32_t bit = 0 /* optional hint */
|
||||
, uint32_t const state_header = 0 /* optional header */
|
||||
) noexcept
|
||||
{
|
||||
typedef Kokkos::pair<int,int> type ;
|
||||
|
||||
if ( ( max_bit_count < bit_bound ) ||
|
||||
( state_header & ~state_header_mask ) ||
|
||||
( bit_bound <= bit ) ) {
|
||||
return type(-3,-3);
|
||||
}
|
||||
|
||||
const uint32_t word_count = bit_bound >> bits_per_int_lg2 ;
|
||||
|
||||
// Use potentially two fetch_add to avoid CAS loop.
|
||||
// Could generate "racing" failure-to-acquire
|
||||
// when is full at the atomic_fetch_add(+1)
|
||||
// then a release occurs before the atomic_fetch_add(-1).
|
||||
|
||||
const uint32_t state = (uint32_t)
|
||||
Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );
|
||||
|
||||
const uint32_t state_error =
|
||||
state_header != ( state & state_header_mask );
|
||||
|
||||
const uint32_t state_bit_used = state & state_used_mask ;
|
||||
|
||||
if ( state_error || ( bit_bound <= state_bit_used ) ) {
|
||||
Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
|
||||
return state_error ? type(-2,-2) : type(-1,-1);
|
||||
}
|
||||
|
||||
// Do not update bit until count is visible:
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
// There is a zero bit available somewhere,
|
||||
// now find the (first) available bit and set it.
|
||||
|
||||
while(1) {
|
||||
|
||||
const uint32_t word = bit >> bits_per_int_lg2 ;
|
||||
const uint32_t mask = 1u << ( bit & bits_per_int_mask );
|
||||
const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);
|
||||
|
||||
if ( ! ( prev & mask ) ) {
|
||||
// Successfully claimed 'result.first' by
|
||||
// atomically setting that bit.
|
||||
return type( bit , state_bit_used + 1 );
|
||||
}
|
||||
|
||||
// Failed race to set the selected bit
|
||||
// Find a new bit to try.
|
||||
|
||||
const int j = Kokkos::Impl::bit_first_zero( prev );
|
||||
|
||||
if ( 0 <= j ) {
|
||||
bit = (word << bits_per_int_lg2 ) | uint32_t(j);
|
||||
}
|
||||
|
||||
if ( ( j < 0 ) || ( bit_bound <= bit ) ) {
|
||||
bit =
|
||||
( (word+1) < word_count ? ((word+1) << bits_per_int_lg2) : 0 )
|
||||
| ( bit & bits_per_int_mask );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**\brief
|
||||
*
|
||||
* Requires: 'bit' previously acquired and has not yet been released.
|
||||
*
|
||||
* Returns:
|
||||
* 0 <= used count after successful release
|
||||
* -1 bit was already released
|
||||
* -2 state_header error
|
||||
*/
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
int release( uint32_t volatile * const buffer
|
||||
, uint32_t const bit
|
||||
, uint32_t const state_header = 0 /* optional header */
|
||||
) noexcept
|
||||
{
|
||||
if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }
|
||||
|
||||
const uint32_t mask = 1u << ( bit & bits_per_int_mask );
|
||||
const uint32_t prev =
|
||||
Kokkos::atomic_fetch_and( buffer + ( bit >> bits_per_int_lg2 ) + 1
|
||||
, ~mask
|
||||
);
|
||||
|
||||
if ( ! ( prev & mask ) ) { return -1 ; }
|
||||
|
||||
// Do not update count until bit clear is visible
|
||||
Kokkos::memory_fence();
|
||||
|
||||
const int count =
|
||||
Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
|
||||
|
||||
return ( count & state_used_mask ) - 1 ;
|
||||
}
|
||||
|
||||
/**\brief
|
||||
*
|
||||
* Requires: Bit within bounds and not already set.
|
||||
*
|
||||
* Returns:
|
||||
* 0 <= used count after successful release
|
||||
* -1 bit was already released
|
||||
* -2 bit or state_header error
|
||||
*/
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
int set( uint32_t volatile * const buffer
|
||||
, uint32_t const bit
|
||||
, uint32_t const state_header = 0 /* optional header */
|
||||
) noexcept
|
||||
{
|
||||
if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }
|
||||
|
||||
const uint32_t mask = 1u << ( bit & bits_per_int_mask );
|
||||
const uint32_t prev =
|
||||
Kokkos::atomic_fetch_or( buffer + ( bit >> bits_per_int_lg2 ) + 1
|
||||
, mask
|
||||
);
|
||||
|
||||
if ( ! ( prev & mask ) ) { return -1 ; }
|
||||
|
||||
// Do not update count until bit clear is visible
|
||||
Kokkos::memory_fence();
|
||||
|
||||
const int count =
|
||||
Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
|
||||
|
||||
return ( count & state_used_mask ) - 1 ;
|
||||
}
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #ifndef KOKKOS_CONCURRENTBITSET_HPP */
|
||||
|
||||
@ -46,6 +46,7 @@
|
||||
#include <cctype>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <cstdlib>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -75,10 +76,10 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
#endif
|
||||
|
||||
// Protect declarations, to prevent "unused variable" warnings.
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
const int num_threads = args.num_threads;
|
||||
const int use_numa = args.num_numa;
|
||||
#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS )
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
const int use_gpu = args.device_id;
|
||||
#endif // defined( KOKKOS_ENABLE_CUDA )
|
||||
@ -103,7 +104,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
|
||||
std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
|
||||
if(num_threads>0) {
|
||||
@ -135,6 +136,25 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
if( Impl::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value ) {
|
||||
if(num_threads>0) {
|
||||
if(use_numa>0) {
|
||||
Kokkos::Experimental::OpenMPTarget::initialize(num_threads,use_numa);
|
||||
}
|
||||
else {
|
||||
Kokkos::Experimental::OpenMPTarget::initialize(num_threads);
|
||||
}
|
||||
} else {
|
||||
Kokkos::Experimental::OpenMPTarget::initialize();
|
||||
}
|
||||
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
|
||||
}
|
||||
else {
|
||||
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
if( std::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
|
||||
if (use_gpu > -1) {
|
||||
@ -166,6 +186,13 @@ void finalize_internal( const bool all_spaces = false )
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
if( std::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
|
||||
if(Kokkos::Experimental::OpenMPTarget::is_initialized())
|
||||
Kokkos::Experimental::OpenMPTarget::finalize();
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
|
||||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
|
||||
@ -175,7 +202,7 @@ void finalize_internal( const bool all_spaces = false )
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
|
||||
std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
|
||||
all_spaces ) {
|
||||
@ -210,7 +237,7 @@ void fence_internal()
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
|
||||
std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
|
||||
Kokkos::Threads::fence();
|
||||
@ -226,6 +253,39 @@ void fence_internal()
|
||||
|
||||
}
|
||||
|
||||
/** \brief  Test whether the command line token \p arg names the option \p expected.
 *
 *  \param arg       NUL-terminated command line token, e.g. "--threads=4".
 *  \param expected  NUL-terminated option name, e.g. "--threads".
 *  \return true when \p arg is exactly \p expected, or starts with \p expected
 *          and the next character does not continue an option name
 *          (i.e. it is not alphanumeric, '-' or '_'); false otherwise.
 */
bool check_arg(char const* arg, char const* expected) {
  std::size_t arg_len = std::strlen(arg);
  std::size_t exp_len = std::strlen(expected);
  if (arg_len < exp_len) return false;
  if (std::strncmp(arg, expected, exp_len) != 0) return false;
  if (arg_len == exp_len) return true;
  /* If expected is "--threads", reject "--threads-for-application":
     inspect the first character after the matched prefix to see whether
     it continues into a longer option name. */
  // <cctype> classifiers require a value representable as unsigned char;
  // passing a negative plain char is undefined behaviour.
  const unsigned char next = static_cast<unsigned char>(arg[exp_len]);
  if (std::isalnum(next) || next == '-' || next == '_') {
    return false;
  }
  return true;
}
|
||||
|
||||
bool check_int_arg(char const* arg, char const* expected, int* value) {
|
||||
if (!check_arg(arg, expected)) return false;
|
||||
std::size_t arg_len = std::strlen(arg);
|
||||
std::size_t exp_len = std::strlen(expected);
|
||||
bool okay = true;
|
||||
if (arg_len == exp_len || arg[exp_len] != '=') okay = false;
|
||||
char const* number = arg + exp_len + 1;
|
||||
if (!Impl::is_unsigned_int(number) || strlen(number) == 0) okay = false;
|
||||
*value = std::atoi(number);
|
||||
if (!okay) {
|
||||
std::ostringstream ss;
|
||||
ss << "Error: expecting an '=INT' after command line argument '" << expected << "'";
|
||||
ss << ". Raised by Kokkos::initialize(int narg, char* argc[]).";
|
||||
Impl::throw_runtime_exception( ss.str() );
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
@ -248,76 +308,31 @@ void initialize(int& narg, char* arg[])
|
||||
int iarg = 0;
|
||||
|
||||
while (iarg < narg) {
|
||||
if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
|
||||
//Find the number of threads (expecting --threads=XX)
|
||||
if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
|
||||
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
|
||||
char* number = strchr(arg[iarg],'=')+1;
|
||||
|
||||
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
|
||||
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
|
||||
if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
|
||||
num_threads = atoi(number);
|
||||
|
||||
//Remove the --kokkos-threads argument from the list but leave --threads
|
||||
if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
|
||||
for(int k=iarg;k<narg-1;k++) {
|
||||
arg[k] = arg[k+1];
|
||||
}
|
||||
kokkos_threads_found=1;
|
||||
narg--;
|
||||
} else {
|
||||
iarg++;
|
||||
if (Impl::check_int_arg(arg[iarg], "--kokkos-threads", &num_threads)) {
|
||||
for(int k=iarg;k<narg-1;k++) {
|
||||
arg[k] = arg[k+1];
|
||||
}
|
||||
} else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
|
||||
//Find the number of numa (expecting --numa=XX)
|
||||
if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
|
||||
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
|
||||
char* number = strchr(arg[iarg],'=')+1;
|
||||
|
||||
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
|
||||
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
|
||||
if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
|
||||
numa = atoi(number);
|
||||
|
||||
//Remove the --kokkos-numa argument from the list but leave --numa
|
||||
if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
|
||||
for(int k=iarg;k<narg-1;k++) {
|
||||
arg[k] = arg[k+1];
|
||||
}
|
||||
kokkos_numa_found=1;
|
||||
narg--;
|
||||
} else {
|
||||
iarg++;
|
||||
kokkos_threads_found=1;
|
||||
narg--;
|
||||
} else if (!kokkos_threads_found && Impl::check_int_arg(arg[iarg], "--threads", &num_threads)) {
|
||||
iarg++;
|
||||
} else if (Impl::check_int_arg(arg[iarg], "--kokkos-numa", &numa)) {
|
||||
for(int k=iarg;k<narg-1;k++) {
|
||||
arg[k] = arg[k+1];
|
||||
}
|
||||
} else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
|
||||
//Find the number of device (expecting --device=XX)
|
||||
if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
|
||||
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
|
||||
char* number = strchr(arg[iarg],'=')+1;
|
||||
|
||||
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
|
||||
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
|
||||
|
||||
if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
|
||||
device = atoi(number);
|
||||
|
||||
//Remove the --kokkos-device argument from the list but leave --device
|
||||
if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
|
||||
for(int k=iarg;k<narg-1;k++) {
|
||||
arg[k] = arg[k+1];
|
||||
}
|
||||
kokkos_device_found=1;
|
||||
narg--;
|
||||
} else {
|
||||
iarg++;
|
||||
kokkos_numa_found=1;
|
||||
narg--;
|
||||
} else if (!kokkos_numa_found && Impl::check_int_arg(arg[iarg], "--numa", &numa)) {
|
||||
iarg++;
|
||||
} else if (Impl::check_int_arg(arg[iarg], "--kokkos-device", &device)) {
|
||||
for(int k=iarg;k<narg-1;k++) {
|
||||
arg[k] = arg[k+1];
|
||||
}
|
||||
} else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
|
||||
kokkos_device_found=1;
|
||||
narg--;
|
||||
} else if (!kokkos_device_found && Impl::check_int_arg(arg[iarg], "--device", &device)) {
|
||||
iarg++;
|
||||
} else if (Impl::check_arg(arg[iarg], "--kokkos-ndevices") || Impl::check_arg(arg[iarg], "--ndevices")) {
|
||||
|
||||
//Find the number of device (expecting --device=XX)
|
||||
if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
|
||||
@ -511,20 +526,8 @@ void print_configuration( std::ostream & out , const bool detail )
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_PTHREAD: ";
|
||||
#ifdef KOKKOS_ENABLE_PTHREAD
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_STDTHREAD: ";
|
||||
#ifdef KOKKOS_ENABLE_STDTHREAD
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_WINTHREAD: ";
|
||||
#ifdef KOKKOS_ENABLE_WINTHREAD
|
||||
msg << " KOKKOS_ENABLE_THREADS: ";
|
||||
#ifdef KOKKOS_ENABLE_THREADS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
@ -754,7 +757,7 @@ void print_configuration( std::ostream & out , const bool detail )
|
||||
#ifdef KOKKOS_ENABLE_OPENMP
|
||||
OpenMP::print_configuration(msg, detail);
|
||||
#endif
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( WINTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
Threads::print_configuration(msg, detail);
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_QTHREADS
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,14 +36,14 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <ostream>
|
||||
#include <sstream>
|
||||
@ -127,7 +127,8 @@ std::string human_memory_size(size_t arg_bytes)
|
||||
#include <execinfo.h>
|
||||
#include <cxxabi.h>
|
||||
#include <dlfcn.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
@ -75,7 +75,9 @@ void abort( const char * const message ) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
Kokkos::Impl::cuda_abort(message);
|
||||
#else
|
||||
Kokkos::Impl::host_abort(message);
|
||||
#ifndef KOKKOS_ENABLE_OPENMPTARGET
|
||||
Kokkos::Impl::host_abort(message);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -1,3 +1,46 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
@ -17,3 +60,4 @@ Impl::PerThreadValue PerThread(const int& arg)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -55,6 +55,46 @@
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType, class Enable = void>
|
||||
struct ReduceFunctorHasInit {
|
||||
enum {value = false};
|
||||
};
|
||||
|
||||
template< class FunctorType>
|
||||
struct ReduceFunctorHasInit<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type > {
|
||||
enum {value = true};
|
||||
};
|
||||
|
||||
template< class FunctorType, class Enable = void>
|
||||
struct ReduceFunctorHasJoin {
|
||||
enum {value = false};
|
||||
};
|
||||
|
||||
template< class FunctorType>
|
||||
struct ReduceFunctorHasJoin<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type > {
|
||||
enum {value = true};
|
||||
};
|
||||
|
||||
template< class FunctorType, class Enable = void>
|
||||
struct ReduceFunctorHasFinal {
|
||||
enum {value = false};
|
||||
};
|
||||
|
||||
template< class FunctorType>
|
||||
struct ReduceFunctorHasFinal<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type > {
|
||||
enum {value = true};
|
||||
};
|
||||
|
||||
template< class FunctorType, class Enable = void>
|
||||
struct ReduceFunctorHasShmemSize {
|
||||
enum {value = false};
|
||||
};
|
||||
|
||||
template< class FunctorType>
|
||||
struct ReduceFunctorHasShmemSize<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type > {
|
||||
enum {value = true};
|
||||
};
|
||||
|
||||
template< class FunctorType , class ArgTag , class Enable = void >
|
||||
struct FunctorDeclaresValueType : public Impl::false_type {};
|
||||
|
||||
@ -63,6 +103,21 @@ struct FunctorDeclaresValueType< FunctorType , ArgTag
|
||||
, typename Impl::enable_if_type< typename FunctorType::value_type >::type >
|
||||
: public Impl::true_type {};
|
||||
|
||||
template< class FunctorType, bool Enable =
|
||||
( FunctorDeclaresValueType<FunctorType,void>::value) ||
|
||||
( ReduceFunctorHasInit<FunctorType>::value ) ||
|
||||
( ReduceFunctorHasJoin<FunctorType>::value ) ||
|
||||
( ReduceFunctorHasFinal<FunctorType>::value ) ||
|
||||
( ReduceFunctorHasShmemSize<FunctorType>::value )
|
||||
>
|
||||
struct IsNonTrivialReduceFunctor {
|
||||
enum {value = false};
|
||||
};
|
||||
|
||||
template< class FunctorType>
|
||||
struct IsNonTrivialReduceFunctor<FunctorType, true> {
|
||||
enum {value = true};
|
||||
};
|
||||
|
||||
/** \brief Query Functor and execution policy argument tag for value type.
|
||||
*
|
||||
|
||||
@ -48,7 +48,6 @@
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_Reducer.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -101,6 +100,29 @@ private:
|
||||
using Tag = typename has_work_tag<>::type ;
|
||||
using WTag = typename has_work_tag<>::wtag ;
|
||||
|
||||
//----------------------------------------
|
||||
// Check for T::execution_space
|
||||
|
||||
// Primary template: used when T has no nested `execution_space` type, or
// when that nested type is void.  Reports value == false and type == void.
template< typename T , typename Probe = std::false_type >
struct has_execution_space
{
  enum { value = false };
  typedef void type ;
};

// Partial specialization: std::is_same< T::execution_space , void >::type is
// std::false_type exactly when T::execution_space exists and is not void,
// which matches the defaulted second argument of the primary template.
template< typename T >
struct has_execution_space
  < T , typename std::is_same< typename T::execution_space , void >::type >
{
  enum { value = true };
  typedef typename T::execution_space type ;
};
|
||||
|
||||
using policy_has_space = has_execution_space< Policy > ;
|
||||
using functor_has_space = has_execution_space< Functor > ;
|
||||
|
||||
static_assert( ! policy_has_space::value ||
|
||||
! functor_has_space::value ||
|
||||
std::is_same< typename policy_has_space::type
|
||||
, typename functor_has_space::type >::value
|
||||
, "Execution Policy and Functor execution space must match" );
|
||||
|
||||
//----------------------------------------
|
||||
// Check for Functor::value_type, which is either a simple type T or T[]
|
||||
|
||||
@ -190,6 +212,16 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
using execution_space = typename std::conditional
|
||||
< functor_has_space::value
|
||||
, typename functor_has_space::type
|
||||
, typename std::conditional
|
||||
< policy_has_space::value
|
||||
, typename policy_has_space::type
|
||||
, Kokkos::DefaultExecutionSpace
|
||||
>::type
|
||||
>::type ;
|
||||
|
||||
using value_type = typename std::remove_extent< candidate_type >::type ;
|
||||
|
||||
static_assert( ! std::is_const< value_type >::value
|
||||
@ -214,14 +246,14 @@ public:
|
||||
private:
|
||||
|
||||
template< bool IsArray , class FF >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
KOKKOS_INLINE_FUNCTION static constexpr
|
||||
typename std::enable_if< IsArray , unsigned >::type
|
||||
get_length( FF const & f ) { return f.value_count ; }
|
||||
|
||||
template< bool IsArray , class FF >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
KOKKOS_INLINE_FUNCTION static constexpr
|
||||
typename std::enable_if< ! IsArray , unsigned >::type
|
||||
get_length( FF const & ) { return 1 ; }
|
||||
get_length( FF const & ) { return candidate_is_void ? 0 : 1 ; }
|
||||
|
||||
public:
|
||||
|
||||
@ -229,25 +261,25 @@ public:
|
||||
! candidate_is_array
|
||||
? sizeof(ValueType) : 0 };
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
KOKKOS_FORCEINLINE_FUNCTION static constexpr
|
||||
unsigned value_count( const Functor & f )
|
||||
{ return FunctorAnalysis::template get_length< candidate_is_array >(f); }
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
KOKKOS_FORCEINLINE_FUNCTION static constexpr
|
||||
unsigned value_size( const Functor & f )
|
||||
{ return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class Unknown >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
KOKKOS_FORCEINLINE_FUNCTION static constexpr
|
||||
unsigned value_count( const Unknown & )
|
||||
{ return 1 ; }
|
||||
{ return candidate_is_void ? 0 : 1 ; }
|
||||
|
||||
template< class Unknown >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
KOKKOS_FORCEINLINE_FUNCTION static constexpr
|
||||
unsigned value_size( const Unknown & )
|
||||
{ return sizeof(ValueType); }
|
||||
{ return candidate_is_void ? 0 : sizeof(ValueType); }
|
||||
|
||||
private:
|
||||
|
||||
@ -277,16 +309,16 @@ private:
|
||||
typedef volatile const ValueType & cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( vref_type , cvref_type ) );
|
||||
void enable_if( void (F::*)( vref_type , cvref_type ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
void join( F const * const f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( *dst , *src ); }
|
||||
{ f->join( *dst , *src ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
@ -296,16 +328,16 @@ private:
|
||||
typedef volatile const ValueType * cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( vref_type , cvref_type ) );
|
||||
void enable_if( void (F::*)( vref_type , cvref_type ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
void join( F const * const f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( dst , src ); }
|
||||
{ f->join( dst , src ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
@ -315,22 +347,22 @@ private:
|
||||
typedef volatile const ValueType & cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
|
||||
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
|
||||
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
void join( F const * const f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( WTag() , *dst , *src ); }
|
||||
{ f->join( WTag() , *dst , *src ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
@ -340,22 +372,22 @@ private:
|
||||
typedef volatile const ValueType * cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
|
||||
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
|
||||
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
void join( F const * const f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( WTag() , dst , src ); }
|
||||
{ f->join( WTag() , dst , src ); }
|
||||
};
|
||||
|
||||
|
||||
@ -364,12 +396,14 @@ private:
|
||||
, typename = void >
|
||||
struct DeduceJoin
|
||||
{
|
||||
enum { value = false };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
void join( F const * const f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{
|
||||
const int n = FunctorAnalysis::value_count( f );
|
||||
const int n = FunctorAnalysis::value_count( *f );
|
||||
for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
|
||||
}
|
||||
};
|
||||
@ -377,8 +411,10 @@ private:
|
||||
template< class F >
|
||||
struct DeduceJoin< F , DISABLE , void >
|
||||
{
|
||||
enum { value = false };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const &
|
||||
void join( F const * const
|
||||
, ValueType volatile *
|
||||
, ValueType volatile const * ) {}
|
||||
};
|
||||
@ -386,7 +422,8 @@ private:
|
||||
template< class F , INTERFACE I >
|
||||
struct DeduceJoin< F , I ,
|
||||
decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
|
||||
: public has_join_function<F,I> {};
|
||||
: public has_join_function<F,I>
|
||||
{ enum { value = true }; };
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
@ -397,38 +434,38 @@ private:
|
||||
struct has_init_function< F , NO_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType & ) );
|
||||
void enable_if( void (F::*)( ValueType & ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( *dst ); }
|
||||
void init( F const * const f , ValueType * dst )
|
||||
{ f->init( *dst ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_init_function< F , NO_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType * ) );
|
||||
void enable_if( void (F::*)( ValueType * ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( dst ); }
|
||||
void init( F const * const f , ValueType * dst )
|
||||
{ f->init( dst ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_init_function< F , HAS_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType & ) );
|
||||
void enable_if( void (F::*)( WTag , ValueType & ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType & ) );
|
||||
void enable_if( void (F::*)( WTag const & , ValueType & ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType & ) );
|
||||
@ -437,18 +474,18 @@ private:
|
||||
void enable_if( void (*)( WTag const & , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( WTag(), *dst ); }
|
||||
void init( F const * const f , ValueType * dst )
|
||||
{ f->init( WTag(), *dst ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_init_function< F , HAS_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType * ) );
|
||||
void enable_if( void (F::*)( WTag , ValueType * ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType * ) );
|
||||
void enable_if( void (F::*)( WTag const & , ValueType * ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType * ) );
|
||||
@ -457,8 +494,8 @@ private:
|
||||
void enable_if( void (*)( WTag const & , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( WTag(), dst ); }
|
||||
void init( F const * const f , ValueType * dst )
|
||||
{ f->init( WTag(), dst ); }
|
||||
};
|
||||
|
||||
template< class F = Functor
|
||||
@ -466,87 +503,29 @@ private:
|
||||
, typename = void >
|
||||
struct DeduceInit
|
||||
{
|
||||
enum { value = false };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
|
||||
void init( F const * const , ValueType * dst ) { new(dst) ValueType(); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct DeduceInit< F , DISABLE , void >
|
||||
{
|
||||
enum { value = false };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & , ValueType * ) {}
|
||||
void init( F const * const , ValueType * ) {}
|
||||
};
|
||||
|
||||
template< class F , INTERFACE I >
|
||||
struct DeduceInit< F , I ,
|
||||
decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
|
||||
: public has_init_function<F,I> {};
|
||||
: public has_init_function<F,I>
|
||||
{ enum { value = true }; };
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
public:
|
||||
|
||||
struct Reducer
|
||||
{
|
||||
private:
|
||||
|
||||
Functor const & m_functor ;
|
||||
ValueType * const m_result ;
|
||||
int const m_length ;
|
||||
|
||||
public:
|
||||
|
||||
using reducer = Reducer ;
|
||||
using value_type = FunctorAnalysis::value_type ;
|
||||
using memory_space = void ;
|
||||
using reference_type = FunctorAnalysis::reference_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join( ValueType volatile * dst
|
||||
, ValueType volatile const * src ) const noexcept
|
||||
{ DeduceJoin<>::join( m_functor , dst , src ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void init( ValueType * dst ) const noexcept
|
||||
{ DeduceInit<>::init( m_functor , dst ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
constexpr Reducer( Functor const & arg_functor
|
||||
, ValueType * arg_value = 0
|
||||
, int arg_length = 0 ) noexcept
|
||||
: m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr int length() const noexcept { return m_length ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType & operator[]( int i ) const noexcept
|
||||
{ return m_result[i]; }
|
||||
|
||||
private:
|
||||
|
||||
template< bool IsArray >
|
||||
constexpr
|
||||
typename std::enable_if< IsArray , ValueType * >::type
|
||||
ref() const noexcept { return m_result ; }
|
||||
|
||||
template< bool IsArray >
|
||||
constexpr
|
||||
typename std::enable_if< ! IsArray , ValueType & >::type
|
||||
ref() const noexcept { return *m_result ; }
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
auto result() const noexcept
|
||||
-> decltype( Reducer::template ref< candidate_is_array >() )
|
||||
{ return Reducer::template ref< candidate_is_array >(); }
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
private:
|
||||
|
||||
template< class , INTERFACE >
|
||||
struct has_final_function ;
|
||||
|
||||
@ -555,14 +534,14 @@ private:
|
||||
struct has_final_function< F , NO_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType & ) );
|
||||
void enable_if( void (F::*)( ValueType & ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( *dst ); }
|
||||
void final( F const * const f , ValueType * dst )
|
||||
{ f->final( *dst ); }
|
||||
};
|
||||
|
||||
// No tag, is array
|
||||
@ -570,14 +549,14 @@ private:
|
||||
struct has_final_function< F , NO_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType * ) );
|
||||
void enable_if( void (F::*)( ValueType * ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( dst ); }
|
||||
void final( F const * const f , ValueType * dst )
|
||||
{ f->final( dst ); }
|
||||
};
|
||||
|
||||
// Has tag, not array
|
||||
@ -585,10 +564,10 @@ private:
|
||||
struct has_final_function< F , HAS_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType & ) );
|
||||
void enable_if( void (F::*)( WTag , ValueType & ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType & ) );
|
||||
void enable_if( void (F::*)( WTag const & , ValueType & ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType & ) );
|
||||
@ -597,8 +576,8 @@ private:
|
||||
void enable_if( void (*)( WTag const & , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( WTag(), *dst ); }
|
||||
void final( F const * const f , ValueType * dst )
|
||||
{ f->final( WTag(), *dst ); }
|
||||
};
|
||||
|
||||
// Has tag, is array
|
||||
@ -606,10 +585,10 @@ private:
|
||||
struct has_final_function< F , HAS_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType * ) );
|
||||
void enable_if( void (F::*)( WTag , ValueType * ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType * ) );
|
||||
void enable_if( void (F::*)( WTag const & , ValueType * ) const );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType * ) );
|
||||
@ -618,8 +597,8 @@ private:
|
||||
void enable_if( void (*)( WTag const & , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( WTag(), dst ); }
|
||||
void final( F const * const f , ValueType * dst )
|
||||
{ f->final( WTag(), dst ); }
|
||||
};
|
||||
|
||||
template< class F = Functor
|
||||
@ -627,20 +606,139 @@ private:
|
||||
, typename = void >
|
||||
struct DeduceFinal
|
||||
{
|
||||
enum { value = false };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void final( F const & , ValueType * ) {}
|
||||
static void final( F const * const , ValueType * ) {}
|
||||
};
|
||||
|
||||
template< class F , INTERFACE I >
|
||||
struct DeduceFinal< F , I ,
|
||||
decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
|
||||
: public has_init_function<F,I> {};
|
||||
: public has_final_function<F,I>
|
||||
{ enum { value = true }; };
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// Primary template: used when none of the partial specializations is selected.
// Reports value == false and a scratch size of zero.
// NOTE(review): this overload takes the functor by reference, whereas the
// specializations visible in this file take a pointer -- confirm which form
// callers are expected to use.
template< class F = Functor , typename = void >
|
||||
struct DeduceTeamShmem
|
||||
{
|
||||
enum { value = false };
|
||||
|
||||
static size_t team_shmem_size( F const & , int ) { return 0 ; }
|
||||
};
|
||||
|
||||
// Specialization enabled when `& F::team_shmem_size` is a valid expression.
// Forwards the query to f->team_shmem_size( team_size ).
template< class F >
|
||||
struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::team_shmem_size ) >::type >
|
||||
{
|
||||
enum { value = true };
|
||||
|
||||
static size_t team_shmem_size( F const * const f , int team_size )
|
||||
{ return f->team_shmem_size( team_size ); }
|
||||
};
|
||||
|
||||
// Specialization enabled when `& F::shmem_size` is a valid expression.
// Forwards the query to f->shmem_size( team_size ).
// NOTE(review): a functor declaring both team_shmem_size and shmem_size would
// presumably match two partial specializations -- confirm that cannot happen.
template< class F >
|
||||
struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::shmem_size ) >::type >
|
||||
{
|
||||
enum { value = true };
|
||||
|
||||
static size_t team_shmem_size( F const * const f , int team_size )
|
||||
{ return f->shmem_size( team_size ); }
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
public:
|
||||
|
||||
// Forwards to DeduceFinal<>::final with the functor and the result pointer.
static void final( Functor const & f , ValueType * result )
|
||||
{ DeduceFinal<>::final( f , result ); }
|
||||
// Forwards to DeduceTeamShmem<>::team_shmem_size.
// NOTE(review): this passes a single reference argument, while every
// DeduceTeamShmem::team_shmem_size overload visible in this file takes two
// parameters -- confirm whether this wrapper is ever instantiated.
inline static
|
||||
size_t team_shmem_size( Functor const & f )
|
||||
{ return DeduceTeamShmem<>::team_shmem_size( f ); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// Compile-time flags mirroring the `value` member of the DeduceJoin,
// DeduceInit and DeduceFinal helpers for the default template arguments.
enum { has_join_member_function = DeduceJoin<>::value };
|
||||
enum { has_init_member_function = DeduceInit<>::value };
|
||||
enum { has_final_member_function = DeduceFinal<>::value };
|
||||
|
||||
|
||||
template< class MemorySpace = typename execution_space::memory_space >
|
||||
struct Reducer
|
||||
{
|
||||
private:
|
||||
|
||||
Functor const * const m_functor ;
|
||||
ValueType * const m_result ;
|
||||
|
||||
template< bool IsArray >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename std::enable_if< IsArray , FunctorAnalysis::ValueType * >::type
|
||||
ref() const noexcept { return m_result ; }
|
||||
|
||||
template< bool IsArray >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename std::enable_if< ! IsArray , FunctorAnalysis::ValueType & >::type
|
||||
ref() const noexcept { return *m_result ; }
|
||||
|
||||
template< bool IsArray >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename std::enable_if< IsArray , int >::type
|
||||
len() const noexcept { return m_functor->value_count ; }
|
||||
|
||||
template< bool IsArray >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
typename std::enable_if< ! IsArray , int >::type
|
||||
len() const noexcept { return candidate_is_void ? 0 : 1 ; }
|
||||
|
||||
public:
|
||||
|
||||
using reducer = Reducer ;
|
||||
using value_type = FunctorAnalysis::value_type ;
|
||||
using memory_space = MemorySpace ;
|
||||
using reference_type = FunctorAnalysis::reference_type ;
|
||||
using functor_type = Functor ; // Adapts a functor
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
value_type * data() const noexcept { return m_result ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
reference_type reference() const noexcept
|
||||
{ return Reducer::template ref< candidate_is_array >(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
int length() const noexcept
|
||||
{ return Reducer::template len< candidate_is_array >(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void copy( ValueType * const dst
|
||||
, ValueType const * const src ) const noexcept
|
||||
{ for ( int i = 0 ; i < Reducer::template len< candidate_is_array >() ; ++i ) dst[i] = src[i] ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join( ValueType volatile * dst
|
||||
, ValueType volatile const * src ) const noexcept
|
||||
{ DeduceJoin<>::join( m_functor , dst , src ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void init( ValueType * dst ) const noexcept
|
||||
{ DeduceInit<>::init( m_functor , dst ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void final( ValueType * dst ) const noexcept
|
||||
{ DeduceFinal<>::final( m_functor , dst ); }
|
||||
|
||||
Reducer( Reducer const & ) = default ;
|
||||
Reducer( Reducer && ) = default ;
|
||||
Reducer & operator = ( Reducer const & ) = delete ;
|
||||
Reducer & operator = ( Reducer && ) = delete ;
|
||||
|
||||
template< class S >
|
||||
using rebind = Reducer< S > ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit constexpr
|
||||
Reducer( Functor const * arg_functor = 0
|
||||
, ValueType * arg_value = 0 ) noexcept
|
||||
: m_functor(arg_functor), m_result(arg_value) {}
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
@ -44,11 +44,10 @@
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <cstddef>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
@ -397,3 +396,4 @@ void unlock_address_hbw_space(void* ptr) {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -46,6 +46,7 @@
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_ENABLE_CUDA )
|
||||
@ -85,10 +86,10 @@
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
#include <cstddef>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
@ -98,6 +99,10 @@
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
#if ( defined( KOKKOS_ENABLE_ASM ) || defined ( KOKKOS_ENABLE_TM ) ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -490,16 +495,50 @@ void init_lock_array_host_space() {
|
||||
}
|
||||
|
||||
bool lock_address_host_space(void* ptr) {
|
||||
#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
|
||||
const unsigned status = _xbegin();
|
||||
|
||||
if( _XBEGIN_STARTED == status ) {
|
||||
const int val = HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) &
|
||||
HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK];
|
||||
|
||||
if( 0 == val ) {
|
||||
HOST_SPACE_ATOMIC_LOCKS[(( size_t(ptr) >> 2 ) &
|
||||
HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] = 1;
|
||||
} else {
|
||||
_xabort( 1 );
|
||||
}
|
||||
|
||||
_xend();
|
||||
|
||||
return 1;
|
||||
} else {
|
||||
#endif
|
||||
return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
|
||||
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
|
||||
0 , 1);
|
||||
#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Clears the host lock-table entry that 'ptr' hashes to.
// With KOKKOS_ENABLE_TM on x86-64 the store is attempted inside an RTM
// transaction; otherwise (or when the transaction does not start) the entry
// is cleared with atomic_exchange.
void unlock_address_host_space(void* ptr) {
  // Index of the lock-table entry for this address.
  const size_t slot =
    (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK;

#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
  const unsigned status = _xbegin();

  if( _XBEGIN_STARTED == status ) {
    HOST_SPACE_ATOMIC_LOCKS[slot] = 0;

    // Commit the transaction. Without this the transactional region opened
    // by _xbegin() is never closed, so the store above is not committed at
    // this point and the caller keeps executing transactionally.
    _xend();
  } else {
#endif
    atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[slot] , 0);
#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
  }
#endif
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -49,7 +49,6 @@
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
#include <impl/Kokkos_Reducer.hpp>
|
||||
#include <impl/Kokkos_FunctorAnalysis.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -507,8 +506,9 @@ public:
|
||||
const scratch_memory_space & thread_scratch(int) const
|
||||
{ return m_scratch.set_team_thread_mode(0,m_data.m_team_size,m_data.m_team_rank); }
|
||||
|
||||
//----------------------------------------
|
||||
//--------------------------------------------------------------------------
|
||||
// Team collectives
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
@ -519,22 +519,6 @@ public:
|
||||
{}
|
||||
#endif
|
||||
|
||||
template< class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_barrier( Closure const & f ) const noexcept
|
||||
{
|
||||
if ( m_data.team_rendezvous() ) {
|
||||
|
||||
// All threads have entered 'team_rendezvous'
|
||||
// only this thread returned from 'team_rendezvous'
|
||||
// with a return value of 'true'
|
||||
|
||||
f();
|
||||
|
||||
m_data.team_rendezvous_release();
|
||||
}
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
template< typename T >
|
||||
@ -613,8 +597,9 @@ public:
|
||||
|
||||
if ( 0 != m_data.m_team_rank ) {
|
||||
// Non-root copies to their local buffer:
|
||||
reducer.copy( (value_type*) m_data.team_reduce_local()
|
||||
, reducer.data() );
|
||||
/*reducer.copy( (value_type*) m_data.team_reduce_local()
|
||||
, reducer.data() );*/
|
||||
*((value_type*) m_data.team_reduce_local()) = reducer.reference();
|
||||
}
|
||||
|
||||
// Root does not overwrite shared memory until all threads arrive
|
||||
@ -630,19 +615,19 @@ public:
|
||||
value_type * const src =
|
||||
(value_type*) m_data.team_member(i)->team_reduce_local();
|
||||
|
||||
reducer.join( reducer.data() , src );
|
||||
reducer.join( reducer.reference(), *src);
|
||||
}
|
||||
|
||||
// Copy result to root member's buffer:
|
||||
reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
|
||||
|
||||
// reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
|
||||
*((value_type*) m_data.team_reduce()) = reducer.reference();
|
||||
m_data.team_rendezvous_release();
|
||||
// This thread released all other threads from 'team_rendezvous'
|
||||
// with a return value of 'false'
|
||||
}
|
||||
else {
|
||||
// Copy from root member's buffer:
|
||||
reducer.copy( reducer.data() , (value_type*) m_data.team_reduce() );
|
||||
reducer.reference() = *((value_type*) m_data.team_reduce());
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -652,7 +637,7 @@ public:
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
template< typename ValueType , class JoinOp >
|
||||
/*template< typename ValueType , class JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType
|
||||
team_reduce( ValueType const & value
|
||||
@ -696,7 +681,7 @@ public:
|
||||
}
|
||||
#else
|
||||
{ Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
|
||||
#endif
|
||||
#endif*/
|
||||
|
||||
|
||||
template< typename T >
|
||||
@ -854,7 +839,7 @@ parallel_reduce
|
||||
, Reducer const & reducer
|
||||
)
|
||||
{
|
||||
reducer.init( reducer.data() );
|
||||
reducer.init( reducer.reference() );
|
||||
|
||||
for( iType i = loop_boundaries.start
|
||||
; i < loop_boundaries.end
|
||||
@ -875,9 +860,9 @@ parallel_reduce
|
||||
, ValueType & result
|
||||
)
|
||||
{
|
||||
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > > reducer( & result );
|
||||
Kokkos::Experimental::Sum<ValueType> reducer( result );
|
||||
|
||||
reducer.init( reducer.data() );
|
||||
reducer.init( result );
|
||||
|
||||
for( iType i = loop_boundaries.start
|
||||
; i < loop_boundaries.end
|
||||
@ -888,7 +873,7 @@ parallel_reduce
|
||||
loop_boundaries.thread.team_reduce( reducer );
|
||||
}
|
||||
|
||||
template< typename iType, class Space
|
||||
/*template< typename iType, class Space
|
||||
, class Closure, class Joiner , typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
@ -910,7 +895,7 @@ void parallel_reduce
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_reduce( reducer );
|
||||
}
|
||||
}*/
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Inter-thread vector parallel_reduce.
|
||||
@ -923,15 +908,13 @@ void parallel_reduce
|
||||
*/
|
||||
template< typename iType, class Space , class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
typename std::enable_if< ! Kokkos::is_reducer<ValueType>::value >::type
|
||||
parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& result)
|
||||
{
|
||||
result = ValueType();
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start ;
|
||||
i < loop_boundaries.end ;
|
||||
i += loop_boundaries.increment) {
|
||||
@ -939,6 +922,22 @@ void parallel_reduce
|
||||
}
|
||||
}
|
||||
|
||||
template< typename iType, class Space , class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const ReducerType& reducer)
|
||||
{
|
||||
reducer.init(reducer.reference());
|
||||
for( iType i = loop_boundaries.start ;
|
||||
i < loop_boundaries.end ;
|
||||
i += loop_boundaries.increment) {
|
||||
lambda(i,reducer.reference());
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce.
|
||||
*
|
||||
* Executes lambda(iType i, ValueType & val) for each i=[0..N)
|
||||
@ -961,9 +960,6 @@ void parallel_reduce
|
||||
const JoinType & join,
|
||||
ValueType& result)
|
||||
{
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start ;
|
||||
i < loop_boundaries.end ;
|
||||
i += loop_boundaries.increment ) {
|
||||
@ -1055,9 +1051,8 @@ template< class Space , class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor )
|
||||
{
|
||||
if ( single.team_member.team_rank() == 0 ) functor();
|
||||
// 'single' does not perform a barrier.
|
||||
// single.team_member.team_barrier( functor );
|
||||
if ( single.team_member.team_rank() == 0 ) functor();
|
||||
}
|
||||
|
||||
template< class Space , class FunctorType , typename ValueType >
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
|
||||
#define KOKKOS_MEMORY_FENCE_HPP
|
||||
namespace Kokkos {
|
||||
@ -108,4 +109,3 @@ void load_fence()
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -212,6 +212,12 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifndef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||
#define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
|
||||
#ifndef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
|
||||
#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT KOKKOS_HAVE_PRAGMA_LOOPCOUNT
|
||||
@ -423,6 +429,12 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
|
||||
#ifndef KOKKOS_ENABLE_THREADS
|
||||
#define KOKKOS_ENABLE_THREADS
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// Deprecated macros
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,13 +44,11 @@
|
||||
#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
|
||||
#define KOKKOS_PHYSICAL_LAYOUT_HPP
|
||||
|
||||
|
||||
#include <Kokkos_View.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
|
||||
|
||||
struct PhysicalLayout {
|
||||
enum LayoutType {Left,Right,Scalar,Error};
|
||||
LayoutType layout_type;
|
||||
@ -71,3 +69,4 @@ struct PhysicalLayout {
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -44,12 +44,14 @@
|
||||
#ifndef KOKKOSP_DEVICE_INFO_HPP
|
||||
#define KOKKOSP_DEVICE_INFO_HPP
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Profiling {
|
||||
|
||||
// Plain record holding a single 32-bit device identifier.
struct KokkosPDeviceInfo {
|
||||
uint32_t deviceID;
|
||||
};
|
||||
// Plain record holding a single 32-bit device identifier.
struct KokkosPDeviceInfo {
|
||||
uint32_t deviceID;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@ -41,197 +41,203 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <string.h>
|
||||
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <cstring>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Profiling {
|
||||
namespace Profiling {
|
||||
|
||||
// Stores a copy of space_name in the handle's 'name' buffer.
SpaceHandle::SpaceHandle(const char* space_name) {
  // strncpy() writes no terminator when the source holds 64 or more
  // characters, so copy at most 63 and terminate explicitly.  The previous
  // call already wrote up to 64 bytes into 'name', so index 63 is in range.
  strncpy(name,space_name,63);
  name[63] = '\0';
}
|
||||
// Stores a copy of space_name in the handle's 'name' buffer.
SpaceHandle::SpaceHandle(const char* space_name) {
  // strncpy() writes no terminator when the source holds 64 or more
  // characters, so copy at most 63 and terminate explicitly.  The previous
  // call already wrote up to 64 bytes into 'name', so index 63 is in range.
  strncpy(name,space_name,63);
  name[63] = '\0';
}
|
||||
|
||||
bool profileLibraryLoaded() {
|
||||
return (NULL != initProfileLibrary);
|
||||
}
|
||||
bool profileLibraryLoaded() {
|
||||
return (NULL != initProfileLibrary);
|
||||
}
|
||||
|
||||
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginForCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelFor(const uint64_t kernelID) {
|
||||
if(NULL != endForCallee) {
|
||||
Kokkos::fence();
|
||||
(*endForCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginScanCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelScan(const uint64_t kernelID) {
|
||||
if(NULL != endScanCallee) {
|
||||
Kokkos::fence();
|
||||
(*endScanCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginReduceCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelReduce(const uint64_t kernelID) {
|
||||
if(NULL != endReduceCallee) {
|
||||
Kokkos::fence();
|
||||
(*endReduceCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void pushRegion(const std::string& kName) {
|
||||
if( NULL != pushRegionCallee ) {
|
||||
Kokkos::fence();
|
||||
(*pushRegionCallee)(kName.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void popRegion() {
|
||||
if( NULL != popRegionCallee ) {
|
||||
Kokkos::fence();
|
||||
(*popRegionCallee)();
|
||||
}
|
||||
}
|
||||
|
||||
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
|
||||
if(NULL != allocateDataCallee) {
|
||||
(*allocateDataCallee)(space,label.c_str(),ptr,size);
|
||||
}
|
||||
}
|
||||
|
||||
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
|
||||
if(NULL != allocateDataCallee) {
|
||||
(*deallocateDataCallee)(space,label.c_str(),ptr,size);
|
||||
}
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
|
||||
// Make sure initialize calls happens only once
|
||||
static int is_initialized = 0;
|
||||
if(is_initialized) return;
|
||||
is_initialized = 1;
|
||||
|
||||
void* firstProfileLibrary;
|
||||
|
||||
char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
|
||||
|
||||
// If we do not find a profiling library in the environment then exit
|
||||
// early.
|
||||
if( NULL == envProfileLibrary ) {
|
||||
return ;
|
||||
}
|
||||
|
||||
char* envProfileCopy = (char*) malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
|
||||
sprintf(envProfileCopy, "%s", envProfileLibrary);
|
||||
|
||||
char* profileLibraryName = strtok(envProfileCopy, ";");
|
||||
|
||||
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
|
||||
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
|
||||
|
||||
if(NULL == firstProfileLibrary) {
|
||||
std::cerr << "Error: Unable to load KokkosP library: " <<
|
||||
profileLibraryName << std::endl;
|
||||
} else {
|
||||
std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
|
||||
|
||||
// dlsym returns a pointer to an object, while we want to assign to pointer to function
|
||||
// A direct cast will give warnings hence, we have to workaround the issue by casting pointer to pointers.
|
||||
auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
|
||||
beginForCallee = *((beginFunction*) &p1);
|
||||
auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
|
||||
beginScanCallee = *((beginFunction*) &p2);
|
||||
auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
|
||||
beginReduceCallee = *((beginFunction*) &p3);
|
||||
|
||||
auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
|
||||
endScanCallee = *((endFunction*) &p4);
|
||||
auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
|
||||
endForCallee = *((endFunction*) &p5);
|
||||
auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
|
||||
endReduceCallee = *((endFunction*) &p6);
|
||||
|
||||
auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
|
||||
initProfileLibrary = *((initFunction*) &p7);
|
||||
auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
|
||||
finalizeProfileLibrary = *((finalizeFunction*) &p8);
|
||||
|
||||
auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
|
||||
pushRegionCallee = *((pushFunction*) &p9);
|
||||
auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
|
||||
popRegionCallee = *((popFunction*) &p10);
|
||||
|
||||
auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
|
||||
allocateDataCallee = *((allocateDataFunction*) &p11);
|
||||
auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
|
||||
deallocateDataCallee = *((deallocateDataFunction*) &p12);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if(NULL != initProfileLibrary) {
|
||||
(*initProfileLibrary)(0,
|
||||
(uint64_t) KOKKOSP_INTERFACE_VERSION,
|
||||
(uint32_t) 0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
free(envProfileCopy);
|
||||
}
|
||||
|
||||
void finalize() {
|
||||
// Make sure finalize calls happens only once
|
||||
static int is_finalized = 0;
|
||||
if(is_finalized) return;
|
||||
is_finalized = 1;
|
||||
|
||||
if(NULL != finalizeProfileLibrary) {
|
||||
(*finalizeProfileLibrary)();
|
||||
|
||||
// Set all profile hooks to NULL to prevent
|
||||
// any additional calls. Once we are told to
|
||||
// finalize, we mean it
|
||||
initProfileLibrary = NULL;
|
||||
finalizeProfileLibrary = NULL;
|
||||
|
||||
beginForCallee = NULL;
|
||||
beginScanCallee = NULL;
|
||||
beginReduceCallee = NULL;
|
||||
endScanCallee = NULL;
|
||||
endForCallee = NULL;
|
||||
endReduceCallee = NULL;
|
||||
|
||||
pushRegionCallee = NULL;
|
||||
popRegionCallee = NULL;
|
||||
|
||||
allocateDataCallee = NULL;
|
||||
deallocateDataCallee = NULL;
|
||||
|
||||
}
|
||||
}
|
||||
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginForCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelFor(const uint64_t kernelID) {
|
||||
if(NULL != endForCallee) {
|
||||
Kokkos::fence();
|
||||
(*endForCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginScanCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelScan(const uint64_t kernelID) {
|
||||
if(NULL != endScanCallee) {
|
||||
Kokkos::fence();
|
||||
(*endScanCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
|
||||
if(NULL != beginReduceCallee) {
|
||||
Kokkos::fence();
|
||||
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
void endParallelReduce(const uint64_t kernelID) {
|
||||
if(NULL != endReduceCallee) {
|
||||
Kokkos::fence();
|
||||
(*endReduceCallee)(kernelID);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void pushRegion(const std::string& kName) {
|
||||
if( NULL != pushRegionCallee ) {
|
||||
Kokkos::fence();
|
||||
(*pushRegionCallee)(kName.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
void popRegion() {
|
||||
if( NULL != popRegionCallee ) {
|
||||
Kokkos::fence();
|
||||
(*popRegionCallee)();
|
||||
}
|
||||
}
|
||||
|
||||
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
|
||||
if(NULL != allocateDataCallee) {
|
||||
(*allocateDataCallee)(space,label.c_str(),ptr,size);
|
||||
}
|
||||
}
|
||||
|
||||
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
|
||||
if(NULL != allocateDataCallee) {
|
||||
(*deallocateDataCallee)(space,label.c_str(),ptr,size);
|
||||
}
|
||||
}
|
||||
|
||||
void initialize() {
|
||||
|
||||
// Make sure initialize calls happens only once
|
||||
static int is_initialized = 0;
|
||||
if(is_initialized) return;
|
||||
is_initialized = 1;
|
||||
|
||||
void* firstProfileLibrary;
|
||||
|
||||
char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
|
||||
|
||||
// If we do not find a profiling library in the environment then exit
|
||||
// early.
|
||||
if( NULL == envProfileLibrary ) {
|
||||
return ;
|
||||
}
|
||||
|
||||
char* envProfileCopy = (char*) malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
|
||||
sprintf(envProfileCopy, "%s", envProfileLibrary);
|
||||
|
||||
char* profileLibraryName = strtok(envProfileCopy, ";");
|
||||
|
||||
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
|
||||
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
|
||||
|
||||
if(NULL == firstProfileLibrary) {
|
||||
std::cerr << "Error: Unable to load KokkosP library: " <<
|
||||
profileLibraryName << std::endl;
|
||||
} else {
|
||||
#ifdef KOKKOS_ENABLE_PROFILING_LOAD_PRINT
|
||||
std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
|
||||
#endif
|
||||
|
||||
// dlsym returns a pointer to an object, while we want to assign to pointer to function
|
||||
// A direct cast will give warnings hence, we have to workaround the issue by casting pointer to pointers.
|
||||
auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
|
||||
beginForCallee = *((beginFunction*) &p1);
|
||||
auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
|
||||
beginScanCallee = *((beginFunction*) &p2);
|
||||
auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
|
||||
beginReduceCallee = *((beginFunction*) &p3);
|
||||
|
||||
auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
|
||||
endScanCallee = *((endFunction*) &p4);
|
||||
auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
|
||||
endForCallee = *((endFunction*) &p5);
|
||||
auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
|
||||
endReduceCallee = *((endFunction*) &p6);
|
||||
|
||||
auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
|
||||
initProfileLibrary = *((initFunction*) &p7);
|
||||
auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
|
||||
finalizeProfileLibrary = *((finalizeFunction*) &p8);
|
||||
|
||||
auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
|
||||
pushRegionCallee = *((pushFunction*) &p9);
|
||||
auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
|
||||
popRegionCallee = *((popFunction*) &p10);
|
||||
|
||||
auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
|
||||
allocateDataCallee = *((allocateDataFunction*) &p11);
|
||||
auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
|
||||
deallocateDataCallee = *((deallocateDataFunction*) &p12);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if(NULL != initProfileLibrary) {
|
||||
(*initProfileLibrary)(0,
|
||||
(uint64_t) KOKKOSP_INTERFACE_VERSION,
|
||||
(uint32_t) 0,
|
||||
NULL);
|
||||
}
|
||||
|
||||
free(envProfileCopy);
|
||||
}
|
||||
|
||||
void finalize() {
|
||||
// Make sure finalize calls happens only once
|
||||
static int is_finalized = 0;
|
||||
if(is_finalized) return;
|
||||
is_finalized = 1;
|
||||
|
||||
if(NULL != finalizeProfileLibrary) {
|
||||
(*finalizeProfileLibrary)();
|
||||
|
||||
// Set all profile hooks to NULL to prevent
|
||||
// any additional calls. Once we are told to
|
||||
// finalize, we mean it
|
||||
initProfileLibrary = NULL;
|
||||
finalizeProfileLibrary = NULL;
|
||||
|
||||
beginForCallee = NULL;
|
||||
beginScanCallee = NULL;
|
||||
beginReduceCallee = NULL;
|
||||
endScanCallee = NULL;
|
||||
endForCallee = NULL;
|
||||
endReduceCallee = NULL;
|
||||
|
||||
pushRegionCallee = NULL;
|
||||
popRegionCallee = NULL;
|
||||
|
||||
allocateDataCallee = NULL;
|
||||
deallocateDataCallee = NULL;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
// Intentionally empty function defined on the #else branch.
// NOTE(review): judging by its name, this presumably exists so the object
// file still exports a symbol when the section above is compiled out -
// confirm against the build setup.
void KOKKOS_CORE_SRC_IMPL_PROFILING_INTERFACE_PREVENT_LINK_ERROR()
{
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -44,108 +44,108 @@
|
||||
#ifndef KOKKOSP_INTERFACE_HPP
|
||||
#define KOKKOSP_INTERFACE_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
|
||||
#include <cstddef>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <string>
|
||||
#include <cinttypes>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
|
||||
#include <dlfcn.h>
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
#include <cstdlib>
|
||||
|
||||
#define KOKKOSP_INTERFACE_VERSION 20150628
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
namespace Kokkos {
|
||||
namespace Profiling {
|
||||
namespace Profiling {
|
||||
|
||||
struct SpaceHandle {
|
||||
SpaceHandle(const char* space_name);
|
||||
char name[64];
|
||||
};
|
||||
struct SpaceHandle {
|
||||
SpaceHandle(const char* space_name);
|
||||
char name[64];
|
||||
};
|
||||
|
||||
typedef void (*initFunction)(const int,
|
||||
const uint64_t,
|
||||
const uint32_t,
|
||||
KokkosPDeviceInfo*);
|
||||
typedef void (*finalizeFunction)();
|
||||
typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
|
||||
typedef void (*endFunction)(uint64_t);
|
||||
typedef void (*initFunction)(const int,
|
||||
const uint64_t,
|
||||
const uint32_t,
|
||||
KokkosPDeviceInfo*);
|
||||
typedef void (*finalizeFunction)();
|
||||
typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
|
||||
typedef void (*endFunction)(uint64_t);
|
||||
|
||||
typedef void (*pushFunction)(const char*);
|
||||
typedef void (*popFunction)();
|
||||
typedef void (*pushFunction)(const char*);
|
||||
typedef void (*popFunction)();
|
||||
|
||||
typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
|
||||
typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
|
||||
typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
|
||||
typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
|
||||
|
||||
|
||||
static initFunction initProfileLibrary = NULL;
|
||||
static finalizeFunction finalizeProfileLibrary = NULL;
|
||||
static initFunction initProfileLibrary = NULL;
|
||||
static finalizeFunction finalizeProfileLibrary = NULL;
|
||||
|
||||
static beginFunction beginForCallee = NULL;
|
||||
static beginFunction beginScanCallee = NULL;
|
||||
static beginFunction beginReduceCallee = NULL;
|
||||
static endFunction endForCallee = NULL;
|
||||
static endFunction endScanCallee = NULL;
|
||||
static endFunction endReduceCallee = NULL;
|
||||
static beginFunction beginForCallee = NULL;
|
||||
static beginFunction beginScanCallee = NULL;
|
||||
static beginFunction beginReduceCallee = NULL;
|
||||
static endFunction endForCallee = NULL;
|
||||
static endFunction endScanCallee = NULL;
|
||||
static endFunction endReduceCallee = NULL;
|
||||
|
||||
static pushFunction pushRegionCallee = NULL;
|
||||
static popFunction popRegionCallee = NULL;
|
||||
static pushFunction pushRegionCallee = NULL;
|
||||
static popFunction popRegionCallee = NULL;
|
||||
|
||||
static allocateDataFunction allocateDataCallee = NULL;
|
||||
static deallocateDataFunction deallocateDataCallee = NULL;
|
||||
static allocateDataFunction allocateDataCallee = NULL;
|
||||
static deallocateDataFunction deallocateDataCallee = NULL;
|
||||
|
||||
|
||||
bool profileLibraryLoaded();
|
||||
bool profileLibraryLoaded();
|
||||
|
||||
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
|
||||
void endParallelFor(const uint64_t kernelID);
|
||||
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
|
||||
void endParallelScan(const uint64_t kernelID);
|
||||
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
|
||||
void endParallelReduce(const uint64_t kernelID);
|
||||
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
|
||||
void endParallelFor(const uint64_t kernelID);
|
||||
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
|
||||
void endParallelScan(const uint64_t kernelID);
|
||||
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
|
||||
void endParallelReduce(const uint64_t kernelID);
|
||||
|
||||
void pushRegion(const std::string& kName);
|
||||
void popRegion();
|
||||
void pushRegion(const std::string& kName);
|
||||
void popRegion();
|
||||
|
||||
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
|
||||
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
|
||||
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
|
||||
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
|
||||
|
||||
void initialize();
|
||||
void finalize();
|
||||
void initialize();
|
||||
void finalize();
|
||||
|
||||
//Define finalize_fake inline to get rid of warnings for unused static variables
|
||||
inline void finalize_fake() {
|
||||
if(NULL != finalizeProfileLibrary) {
|
||||
(*finalizeProfileLibrary)();
|
||||
//Define finalize_fake inline to get rid of warnings for unused static variables
|
||||
inline void finalize_fake() {
|
||||
if(NULL != finalizeProfileLibrary) {
|
||||
(*finalizeProfileLibrary)();
|
||||
|
||||
// Set all profile hooks to NULL to prevent
|
||||
// any additional calls. Once we are told to
|
||||
// finalize, we mean it
|
||||
beginForCallee = NULL;
|
||||
beginScanCallee = NULL;
|
||||
beginReduceCallee = NULL;
|
||||
endScanCallee = NULL;
|
||||
endForCallee = NULL;
|
||||
endReduceCallee = NULL;
|
||||
|
||||
allocateDataCallee = NULL;
|
||||
deallocateDataCallee = NULL;
|
||||
|
||||
initProfileLibrary = NULL;
|
||||
finalizeProfileLibrary = NULL;
|
||||
pushRegionCallee = NULL;
|
||||
popRegionCallee = NULL;
|
||||
}
|
||||
}
|
||||
// Set all profile hooks to NULL to prevent
|
||||
// any additional calls. Once we are told to
|
||||
// finalize, we mean it
|
||||
beginForCallee = NULL;
|
||||
beginScanCallee = NULL;
|
||||
beginReduceCallee = NULL;
|
||||
endScanCallee = NULL;
|
||||
endForCallee = NULL;
|
||||
endReduceCallee = NULL;
|
||||
|
||||
allocateDataCallee = NULL;
|
||||
deallocateDataCallee = NULL;
|
||||
|
||||
initProfileLibrary = NULL;
|
||||
finalizeProfileLibrary = NULL;
|
||||
pushRegionCallee = NULL;
|
||||
popRegionCallee = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user