Updating Kokkos lib to v2.03.05

This commit is contained in:
Stan Moore
2017-06-08 10:42:08 -06:00
parent 286d4f2743
commit 5c1d17d1c0
474 changed files with 50972 additions and 10897 deletions

View File

@ -44,14 +44,12 @@
#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
#include <Kokkos_Macros.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
#include <iostream>
#include <algorithm>
#include <stdio.h>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
#include <cstdio>
#include <utility>
@ -1298,3 +1296,4 @@ protected:
#endif
#endif

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,11 +45,10 @@
#define KOKKOS_CUDAEXEC_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_CUDA
#include <string>
#include <cstdint>
#include <Kokkos_Parallel.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_abort.hpp>
@ -99,6 +98,8 @@ CudaSpace::size_type cuda_internal_maximum_warp_count();
CudaSpace::size_type cuda_internal_maximum_grid_count();
CudaSpace::size_type cuda_internal_maximum_shared_words();
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
@ -146,7 +147,7 @@ Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
namespace Kokkos {
namespace Impl {
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
}
}
@ -319,3 +320,4 @@ struct CudaParallelLaunch< DriverType , false > {
#endif /* defined( __CUDACC__ ) */
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */

View File

@ -41,16 +41,15 @@
//@HEADER
*/
#include <stdlib.h>
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <atomic>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_CUDA
#include <Kokkos_Core.hpp>
#include <Kokkos_Cuda.hpp>
@ -890,9 +889,9 @@ void init_lock_arrays_cuda_space() {
}
}
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
static void* ptr = NULL;
static size_t current_size = 0;
static std::int64_t current_size = 0;
if(current_size == 0) {
current_size = bytes;
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
@ -911,5 +910,7 @@ void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
}
}
#else
// Deliberately empty symbol emitted on the non-CUDA (#else) path.
// Going by its name, it is here so that this translation unit still
// defines something when the CUDA code above is compiled out.
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR()
{
}
#endif // KOKKOS_ENABLE_CUDA

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,8 +45,6 @@
#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_CUDA
#include <impl/Kokkos_Traits.hpp>
@ -75,7 +73,7 @@ shared_allocation_record( Kokkos::CudaSpace const & arg_space
new( functor ) DestructFunctor( arg_destruct );
record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
return record ;
}

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,8 +45,6 @@
#define KOKKOS_CUDA_ERROR_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_CUDA
namespace Kokkos { namespace Impl {
@ -67,3 +65,4 @@ inline void cuda_internal_safe_call( cudaError e , const char * name, const char
#endif //KOKKOS_ENABLE_CUDA
#endif //KOKKOS_CUDA_ERROR_HPP

View File

@ -44,11 +44,11 @@
/*--------------------------------------------------------------------------*/
/* Kokkos interfaces */
#include <Kokkos_Core.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
@ -56,7 +56,7 @@
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
#include <stdlib.h>
#include <cstdlib>
/* Standard 'C++' libraries */
#include <vector>
@ -404,9 +404,23 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
// Query what compute capability architecture a kernel executes:
m_cudaArch = cuda_kernel_arch();
if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
int compiled_major = m_cudaArch / 100;
int compiled_minor = ( m_cudaArch % 100 ) / 10;
if ( compiled_major < 5 && cudaProp.major >= 5 ) {
std::stringstream ss;
ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for compute capability "
<< compiled_major << "." << compiled_minor
<< " (< 5.0) on device with compute capability "
<< cudaProp.major << "." << cudaProp.minor
<< " (>=5.0), this would give incorrect results!"
<< std::endl ;
std::string msg = ss.str();
Kokkos::abort( msg.c_str() );
}
if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
<< compiled_major << "." << compiled_minor
<< " on device with compute capability "
<< cudaProp.major << "." << cudaProp.minor
<< " , this will likely reduce potential performance."
@ -661,6 +675,15 @@ void CudaInternal::finalize()
Cuda::size_type cuda_internal_multiprocessor_count()
{ return CudaInternal::singleton().m_multiProcCount ; }
/** \brief Upper bound on the number of thread blocks resident on the device
 *         at the same time.
 *
 *  Computed as the multiprocessor count stored in the CudaInternal singleton
 *  multiplied by a fixed limit of 32 resident blocks per multiprocessor.
 *
 *  NOTE(review): the constant is the limit for compute capability 5.0
 *  through 6.2 only; other architectures may have a different limit and
 *  are not distinguished here.
 *
 *  \return multiprocessor count * 32
 */
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
{
  // Compute capability 5.0 through 6.2
  enum : int { max_resident_blocks_per_multiprocessor = 32 };

  return CudaInternal::singleton().m_multiProcCount
       * max_resident_blocks_per_multiprocessor ;
}
Cuda::size_type cuda_internal_maximum_warp_count()
{ return CudaInternal::singleton().m_maxWarpCount ; }
@ -772,8 +795,10 @@ void Cuda::fence()
Kokkos::Impl::cuda_device_synchronize();
}
// Label of this execution space; always the string literal "Cuda".
const char* Cuda::name()
{
  return "Cuda";
}
} // namespace Kokkos
#else
// Deliberately empty symbol emitted on the non-CUDA (#else) path.
// Going by its name, it is here so that this translation unit still
// defines something when the CUDA implementation is compiled out.
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR()
{
}
#endif // KOKKOS_ENABLE_CUDA
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,19 +36,18 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_INTERNAL_HPP
#define KOKKOS_CUDA_INTERNAL_HPP
#include<iostream>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_CUDA
#include<iostream>
#include <Cuda/Kokkos_Cuda_Error.hpp>
namespace Kokkos { namespace Impl {

File diff suppressed because it is too large Load Diff

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -45,8 +45,6 @@
#define KOKKOS_CUDA_REDUCESCAN_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <utility>
@ -63,6 +61,7 @@ namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// Shuffle operations require input to be a register (stack) variable
template< typename T >
__device__ inline
@ -140,77 +139,6 @@ void cuda_shfl_up( T & out , T const & in , int delta ,
}
}
//----------------------------------------------------------------------------
/** \brief Reduce within a warp over blockDim.x, the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Requires:
* blockDim.x is power of 2
* blockDim.x <= 32 (one warp)
*
* Cannot use "butterfly" pattern because floating point
* addition is non-associative. Therefore, must broadcast
* the final result.
*/
// Reduces reducer.reference() across the blockDim.x lanes and leaves the
// combined value in every lane's reducer.reference().
// Does nothing when blockDim.x is 1.
template< class Reducer >
__device__ inline
void cuda_intra_warp_vector_reduce( Reducer const & reducer )
{
// reducer.reference() is used as the output argument of cuda_shfl below,
// so it has to be an actual reference type.
static_assert(
std::is_reference< typename Reducer::reference_type >::value , "" );
if ( 1 < blockDim.x ) {
typename Reducer::value_type tmp ;
// Shuffle distance i takes the values blockDim.x/2, blockDim.x/4, ..., 1.
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
// Only lanes with threadIdx.x < i join the shuffled value into their own.
if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
}
// Broadcast from root "lane" to all other "lanes"
cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
}
}
/** \brief Inclusive scan over blockDim.x, the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Algorithm is concurrent bottom-up reductions in triangular pattern
* where each CUDA thread is the root of a reduction tree from the
* zeroth CUDA thread to itself.
*
* Requires:
* blockDim.x is power of 2
* blockDim.x <= 32 (one warp)
*/
// Inclusive scan of `local` across the blockDim.x lanes, combining values
// with operator+=. The result is written back into `local`.
template< typename ValueType >
__device__ inline
void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
{
ValueType tmp ;
// Bottom up:
// [t] += [t-1] if t >= 1
// [t] += [t-2] if t >= 2
// [t] += [t-4] if t >= 4
// ...
// The shuffle distance i doubles each pass: 1, 2, 4, ... while i < blockDim.x.
for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
cuda_shfl_up( tmp , local , i , blockDim.x );
// Lanes with threadIdx.x < i skip the accumulation for this pass.
if ( i <= threadIdx.x ) { local += tmp ; }
}
}
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
@ -247,12 +175,12 @@ inline void cuda_inter_warp_reduction( ValueType& value,
#define STEP_WIDTH 4
// Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries
// The reason not to use ValueType directly is that for types with constructors it
// The reason not to use ValueType directly is that for types with constructors it
// could lead to race conditions
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
const unsigned step = 32 / blockDim.x;
unsigned shift = STEP_WIDTH;
const int step = 32 / blockDim.x;
int shift = STEP_WIDTH;
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
if(id < STEP_WIDTH ) {
result[id] = value;
@ -297,7 +225,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
//Do the intra-block reduction with shfl operations and static shared memory
cuda_intra_block_reduction(value,join,max_active_thread);
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
const int id = threadIdx.y*blockDim.x + threadIdx.x;
//One thread in the block writes block result to global scratch_memory
if(id == 0 ) {
@ -329,35 +257,35 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
//Reduce all global values with splitting work over threads in one warp
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
for(int i=id; i<gridDim.x; i+=step_size) {
for(int i=id; i<(int)gridDim.x; i+=step_size) {
value_type tmp = global[i];
join(value, tmp);
}
//Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
if (blockDim.x*blockDim.y > 1) {
if (int(blockDim.x*blockDim.y) > 1) {
value_type tmp = Kokkos::shfl_down(value, 1,32);
if( id + 1 < gridDim.x )
if( id + 1 < int(gridDim.x) )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 2) {
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < gridDim.x )
if( id + 2 < int(gridDim.x) )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 4) {
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < gridDim.x )
if( id + 4 < int(gridDim.x) )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 8) {
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < gridDim.x )
if( id + 8 < int(gridDim.x) )
join(value, tmp);
}
if (blockDim.x*blockDim.y > 16) {
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < gridDim.x )
if( id + 16 < int(gridDim.x) )
join(value, tmp);
}
}
@ -370,6 +298,166 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
#endif
}
/** \brief Warp-level reduction driven by a reducer object.
 *
 *  Starts from reducer.reference(), joins in values obtained with shfl_down
 *  at distances blockDim.x, 2*blockDim.x, 4*blockDim.x, ... (while the
 *  distance is below 32), then replaces the local value with the one
 *  fetched by shfl from index 0 and stores it through reducer.reference().
 *
 *  \param reducer            supplies value_type, join() and reference()
 *  \param max_active_thread  a join is performed only when
 *                            threadIdx.y + stride is below this bound
 */
template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_warp_reduction( const ReducerType& reducer,
                           const int max_active_thread = blockDim.y) {

  typedef typename ReducerType::value_type ValueType;

  ValueType accum = reducer.reference();

  // Reduce over values from threads with different threadIdx.y
  for ( unsigned int stride = 1 ; blockDim.x * stride < 32 ; stride *= 2 ) {
    const ValueType partner = shfl_down(accum, blockDim.x*stride,32u);

    // Only join if the upper thread is active
    // (this allows a non power of two blockDim.y)
    if(threadIdx.y + stride < max_active_thread) {
      reducer.join(accum , partner);
    }
  }

  accum = shfl(accum,0,32);
  reducer.reference() = accum;
}
// Reducer-based reduction across the threadIdx.y dimension of a block using
// a small __shared__ staging buffer of STEP_WIDTH slots.
// Reads the input from reducer.reference() and writes the combined value
// back through reducer.reference().
// max_active_thread bounds both the batching loop and the final combine.
template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_inter_warp_reduction( const ReducerType& reducer,
const int max_active_thread = blockDim.y) {
typedef typename ReducerType::value_type ValueType;
// Number of shared-memory slots; ids are processed in batches of this size.
#define STEP_WIDTH 4
// Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries.
// The reason not to use ValueType directly is that for types with constructors it
// could lead to race conditions
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
ValueType value = reducer.reference();
const int step = 32 / blockDim.x;
int shift = STEP_WIDTH;
// Threads whose threadIdx.y is a multiple of step get id = threadIdx.y/step;
// all other threads get 65000, which fails the slot tests below.
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
// The first STEP_WIDTH ids seed the shared slots with their own value.
if(id < STEP_WIDTH ) {
result[id] = value;
}
__syncthreads();
// Fold the remaining ids into the slots, one batch of STEP_WIDTH ids per
// iteration, with a barrier after each batch.
while (shift<=max_active_thread/step) {
if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
reducer.join(result[id%STEP_WIDTH],value);
}
__syncthreads();
shift+=STEP_WIDTH;
}
// Each thread combines the populated slots into its own copy.
value = result[0];
for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
reducer.join(value,result[i]);
reducer.reference() = value;
}
/** \brief Block-wide reduction through a reducer object.
 *
 *  Two stages, run back to back with the same arguments:
 *  cuda_intra_warp_reduction followed by cuda_inter_warp_reduction.
 *
 *  \param reducer            reducer passed unchanged to both stages
 *  \param max_active_thread  bound passed unchanged to both stages
 */
template< class ReducerType >
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
cuda_intra_block_reduction( const ReducerType& reducer,
                            const int max_active_thread = blockDim.y)
{
  // Stage 1: within a warp.
  cuda_intra_warp_reduction( reducer , max_active_thread );

  // Stage 2: across warps.
  cuda_inter_warp_reduction( reducer , max_active_thread );
}
/** \brief Grid-wide reduction through a reducer object.
 *
 *  Each block reduces its own contribution, thread id 0 of the block stores
 *  that partial result into m_scratch_space[blockIdx.x], and the block that
 *  increments the arrival counter *m_scratch_flags last (observing
 *  gridDim.x - 1) combines all partial results with its first 32 threads.
 *  The counter is reset to zero by that block.
 *
 *  A __threadfence() is issued between the store of the partial result and
 *  the atomic increment of the arrival counter, so that the partial result
 *  is visible to the block that later reads the scratch array.
 *
 *  NOTE(review): the combined result is accumulated in the local variable
 *  `value` and is not written back to reducer.reference() in this function;
 *  confirm how callers are expected to obtain it.
 *
 *  \param reducer            supplies value_type, init(), join(), reference()
 *  \param m_scratch_space    global scratch, indexed by blockIdx.x as value_type
 *  \param m_scratch_flags    global arrival counter
 *  \param max_active_thread  forwarded to the intra-block reduction
 *  \return in device code, true for the threads with id < 32 of the block
 *          that arrived last and false otherwise; in host code, always true.
 */
template< class ReducerType>
__device__ inline
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value , bool >::type
cuda_inter_block_reduction( const ReducerType& reducer,
                            Cuda::size_type * const m_scratch_space,
                            Cuda::size_type * const m_scratch_flags,
                            const int max_active_thread = blockDim.y) {
#ifdef __CUDA_ARCH__
  typedef typename ReducerType::value_type* pointer_type;
  typedef typename ReducerType::value_type value_type;

  //Do the intra-block reduction with shfl operations and static shared memory
  cuda_intra_block_reduction(reducer,max_active_thread);

  value_type value = reducer.reference();

  const int id = threadIdx.y*blockDim.x + threadIdx.x;

  //One thread in the block writes block result to global scratch_memory
  if(id == 0 ) {
    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
    *global = value;
  }

  //One warp of last block performs inter block reduction through loading the block values from global scratch_memory
  bool last_block = false;

  // Make the partial result written above visible device-wide before this
  // block signals its arrival through the atomic counter below.
  __threadfence();
  __syncthreads();
  if ( id < 32 ) {
    Cuda::size_type count;

    //Figure out whether this is the last block
    if(id == 0)
      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
    count = Kokkos::shfl(count,0,32);

    //Last block does the inter block reduction
    if( count == gridDim.x - 1) {
      //set flag back to zero
      if(id == 0)
        *m_scratch_flags = 0;
      last_block = true;
      reducer.init(value);

      pointer_type const volatile global = (pointer_type) m_scratch_space ;

      //Reduce all global values with splitting work over threads in one warp
      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
      for(int i=id; i<(int)gridDim.x; i+=step_size) {
        value_type tmp = global[i];
        reducer.join(value, tmp);
      }

      //Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
      if (int(blockDim.x*blockDim.y) > 1) {
        value_type tmp = Kokkos::shfl_down(value, 1,32);
        if( id + 1 < int(gridDim.x) )
          reducer.join(value, tmp);
      }
      if (int(blockDim.x*blockDim.y) > 2) {
        value_type tmp = Kokkos::shfl_down(value, 2,32);
        if( id + 2 < int(gridDim.x) )
          reducer.join(value, tmp);
      }
      if (int(blockDim.x*blockDim.y) > 4) {
        value_type tmp = Kokkos::shfl_down(value, 4,32);
        if( id + 4 < int(gridDim.x) )
          reducer.join(value, tmp);
      }
      if (int(blockDim.x*blockDim.y) > 8) {
        value_type tmp = Kokkos::shfl_down(value, 8,32);
        if( id + 8 < int(gridDim.x) )
          reducer.join(value, tmp);
      }
      if (int(blockDim.x*blockDim.y) > 16) {
        value_type tmp = Kokkos::shfl_down(value, 16,32);
        if( id + 16 < int(gridDim.x) )
          reducer.join(value, tmp);
      }
    }
  }

  //The last block has in its thread=0 the global reduction value through "value"
  return last_block;
#else
  return true;
#endif
}
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
@ -529,11 +617,11 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
#if (__CUDA_ARCH__ < 500)
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
#else
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
#endif
//#if (__CUDA_ARCH__ < 500)
for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
//#else
// for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
//#endif
}

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,15 +36,16 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG )
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
@ -120,7 +121,7 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
}
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
queue->complete( task.ptr );
}
}
} while(1);
@ -157,7 +158,7 @@ printf("cuda_task_queue_execute before\n");
// If not large enough then set the stack size, in bytes:
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
CUDA_SAFE_CALL( cudaGetLastError() );
@ -173,7 +174,7 @@ printf("cuda_task_queue_execute after\n");
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#else
// Deliberately empty symbol emitted on the #else path, i.e. when CUDA or
// TASKDAG support is not enabled. Going by its name, it is here so that this
// translation unit still defines something when the task code is compiled out.
void KOKKOS_CORE_SRC_CUDA_KOKKOS_CUDA_TASK_PREVENT_LINK_ERROR()
{
}
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */

View File

@ -44,6 +44,7 @@
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------
@ -325,7 +326,7 @@ ValueType shfl_warp_broadcast
return Kokkos::shfl(val, src_lane, width);
}
// all-reduce across corresponding vector lanes between team members within warp
/*// all-reduce across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
@ -351,7 +352,7 @@ void parallel_reduce
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
}*/
// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
@ -382,13 +383,36 @@ void parallel_reduce
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// Reducer-based parallel_reduce over a TeamThreadRange for Cuda task teams.
//
// Each caller accumulates its share of [start, end) into a private value
// initialised by reducer.init(), the private values are combined with
// strided_shfl_warp_reduction using reducer.join(), and the value returned
// by shfl_warp_broadcast is stored through reducer.reference().
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   const ReducerType& reducer) {

  typedef typename ReducerType::value_type ValueType;

  // TODO (carried over from the original author): it is unclear why a
  // separate temporary is created here.
  ValueType partial = ValueType();
  reducer.init(partial);

  // Accumulate this caller's iterations.
  iType idx = loop_boundaries.start;
  while ( idx < loop_boundaries.end ) {
    lambda(idx,partial);
    idx += loop_boundaries.increment;
  }

  // Combine the private accumulators using the reducer's join.
  strided_shfl_warp_reduction(
    [&] (ValueType& dst, const ValueType& src) { reducer.join(dst,src); },
    partial,
    loop_boundaries.thread.team_size(),
    blockDim.x);

  reducer.reference() =
    shfl_warp_broadcast<ValueType>( partial, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
/*template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
@ -404,7 +428,7 @@ void parallel_reduce
multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
}*/
// all-reduce within team members within warp
// if no join() provided, use sum
@ -436,6 +460,28 @@ void parallel_reduce
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// Reducer-based parallel_reduce over a ThreadVectorRange for Cuda task teams.
//
// Each caller accumulates its share of [start, end) into a private value
// initialised by reducer.init(), the private values are combined with
// multi_shfl_warp_reduction using reducer.join(), and the value returned by
// shfl_warp_broadcast (source index 0, width blockDim.x) is stored through
// reducer.reference().
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
   const Lambda & lambda,
   const ReducerType& reducer) {

  typedef typename ReducerType::value_type ValueType;

  ValueType partial = ValueType();
  reducer.init(partial);

  // Accumulate this caller's iterations.
  iType idx = loop_boundaries.start;
  while ( idx < loop_boundaries.end ) {
    lambda(idx,partial);
    idx += loop_boundaries.increment;
  }

  // Combine the private accumulators using the reducer's join.
  multi_shfl_warp_reduction(
    [&] (ValueType& dst, const ValueType& src) { reducer.join(dst, src); },
    partial,
    blockDim.x);

  reducer.reference() =
    shfl_warp_broadcast<ValueType>( partial, 0, blockDim.x );
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride

View File

@ -0,0 +1,982 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_TEAM_HPP
#define KOKKOS_CUDA_TEAM_HPP
#include <iostream>
#include <algorithm>
#include <stdio.h>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <utility>
#include <Kokkos_Parallel.hpp>
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Kokkos_Vectorization.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Join functor that combines two values with operator+=.
 *
 *  value_type is the template argument; join() adds `input` into `update`.
 *  Both arguments are taken as volatile references.
 */
template< typename Type >
struct CudaJoinFunctor {

  using value_type = Type ;

  KOKKOS_INLINE_FUNCTION
  static void join( volatile value_type & update ,
                    volatile const value_type & input )
  {
    update += input ;
  }
};
/**\brief Team member_type passed to TeamPolicy or TeamTask closures.
*
* Cuda thread blocks for team closures are dimensioned as:
* blockDim.x == number of "vector lanes" per "thread"
* blockDim.y == number of "threads" per team
* blockDim.z == number of teams in a block
* where
* A set of teams exactly fill a warp OR a team is the whole block
* ( 0 == WarpSize % ( blockDim.x * blockDim.y ) )
* OR
* ( 1 == blockDim.z )
*
* Thus when 1 < blockDim.z the team is warp-synchronous
* and __syncthreads should not be called in team collectives.
*
* When multiple teams are mapped onto a single block then the
* total available shared memory must be partitioned among teams.
*/
class CudaTeamMember {
private:
typedef Kokkos::Cuda execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
void * m_team_reduce ;
scratch_memory_space m_team_shared ;
int m_team_reduce_size ;
int m_league_rank ;
int m_league_size ;
public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space &
team_scratch(const int& level) const
{ return m_team_shared.set_team_thread_mode(level,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space &
thread_scratch(const int& level) const
{ return m_team_shared.set_team_thread_mode(level,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const
{
#ifdef __CUDA_ARCH__
return threadIdx.y ;
#else
return 0;
#endif
}
KOKKOS_INLINE_FUNCTION int team_size() const
{
#ifdef __CUDA_ARCH__
return blockDim.y ;
#else
return 1;
#endif
}
KOKKOS_INLINE_FUNCTION void team_barrier() const
{
#ifdef __CUDA_ARCH__
if ( 1 == blockDim.z ) __syncthreads(); // team == block
else __threadfence_block(); // team <= warp
#endif
}
//--------------------------------------------------------------------------
/** \brief Broadcast 'val' from team thread 'thread_id' to the calling thread.
*
* On return 'val' holds the broadcast value. Two paths are used:
* - 1 == blockDim.z : the value is staged through the workspace at
* m_team_reduce between two __syncthreads() calls;
* - otherwise : a shuffle is issued with source index
* blockDim.x * thread_id and width blockDim.x * blockDim.y.
* Does nothing when compiled for the host.
*
* NOTE(review): the staged path writes sizeof(ValueType) bytes to
* m_team_reduce without comparing against m_team_reduce_size -- confirm
* that callers guarantee the workspace is large enough.
*/
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast( ValueType & val, const int& thread_id) const
{
#ifdef __CUDA_ARCH__
if ( 1 == blockDim.z ) { // team == block
__syncthreads();
// Wait for shared data write until all threads arrive here
// Only vector lane 0 of the source thread writes the value.
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
*((ValueType*) m_team_reduce) = val ;
}
__syncthreads(); // Wait for shared data read until root thread writes
val = *((ValueType*) m_team_reduce);
}
else { // team <= warp
ValueType tmp( val ); // input might not be a register variable
cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
}
#endif
}
//--------------------------------------------------------------------------
/**\brief Reduction across a team
*
* The calling thread's contribution is read from reducer.reference()
* and the result is written back to reducer.reference().
* Does nothing when compiled for the host.
*
* Mapping of teams onto blocks:
* blockDim.x is "vector lanes"
* blockDim.y is team "threads"
* blockDim.z is number of teams per block
*
* Requires:
* blockDim.x is power two
* blockDim.x <= CudaTraits::WarpSize
* ( 0 == CudaTraits::WarpSize % ( blockDim.x * blockDim.y ) )
* OR
* ( 1 == blockDim.z )
*
* NOTE(review): 'wx' is computed without threadIdx.z; confirm this is
* intended when several teams share a block.
* NOTE(review): if m_team_reduce_size < sizeof(value_type) then nsh == 0
* and the shift count computed from __clz(nsh-1) below is 32 -- confirm
* that callers always reserve at least one entry.
*/
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( ReducerType const & reducer ) const noexcept
{
#ifdef __CUDA_ARCH__
typedef typename ReducerType::value_type value_type ;
// 'tmp' accumulates this thread's partial result.
value_type tmp( reducer.reference() );
// reduce within the warp using shuffle
// wx: position within the warp derived from threadIdx.x and threadIdx.y
const int wx =
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
// Shuffle distance i is halved each pass: WarpSize/2, ... , blockDim.x
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
// Root of each vector lane reduces:
if ( 0 == threadIdx.x && wx < i ) {
reducer.join( tmp , reducer.reference() );
}
}
if ( 1 < blockDim.z ) { // team <= warp
// broadcast result from root vector lane of root thread
// NOTE(review): the source index passed here is blockDim.x * threadIdx.y,
// i.e. it depends on the calling thread -- confirm it selects the root.
cuda_shfl( reducer.reference() , tmp
, blockDim.x * threadIdx.y , CudaTraits::WarpSize );
}
else { // team == block
// Reduce across warps using shared memory
// Broadcast result within block
// Number of warps, blockDim.y may not be power of two:
const int nw = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
// Warp index:
const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
// Number of shared memory entries for the reduction:
int nsh = m_team_reduce_size / sizeof(value_type);
// Using at most one entry per warp:
if ( nw < nsh ) nsh = nw ;
__syncthreads(); // Wait before shared data write
// The first nsh warps store their partial result.
if ( 0 == wx && wy < nsh ) {
((value_type*) m_team_reduce)[wy] = tmp ;
}
// When more warps than shared entries:
// warps [i,i+nsh) join into entries [0,nsh) in turn.
for ( int i = nsh ; i < nw ; i += nsh ) {
__syncthreads();
if ( 0 == wx && i <= wy ) {
const int k = wy - i ;
if ( k < nsh ) {
reducer.join( *((value_type*) m_team_reduce + k) , tmp );
}
}
}
__syncthreads();
// One warp performs the inter-warp reduction:
if ( 0 == wy ) {
// Start at power of two covering nsh
for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
const int k = wx + i ;
if ( wx < i && k < nsh ) {
reducer.join( ((value_type*)m_team_reduce)[wx]
, ((value_type*)m_team_reduce)[k] );
__threadfence_block();
}
}
}
__syncthreads(); // Wait for reduction
// Broadcast result to all threads
reducer.reference() = *((value_type*)m_team_reduce);
}
#endif /* #ifdef __CUDA_ARCH__ */
}
//--------------------------------------------------------------------------
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*
* \param value this thread's contribution
* \param global_accum optional inter-team accumulator; when non-null the
* last thread of the team atomically adds a value to it
* and the previous accumulator value is added to every
* thread's result
* \return base_data[ threadIdx.y ] in device code, Type() on the host.
*
* The workspace at m_team_reduce is indexed up to entry blockDim.y.
* NOTE(review): no check against m_team_reduce_size is made here.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION
Type team_scan( const Type & value , Type * const global_accum ) const
{
#ifdef __CUDA_ARCH__
Type * const base_data = (Type *) m_team_reduce ;
__syncthreads(); // Don't write in to shared data until all threads have entered this function
// Entry 0 is zeroed; thread t stores its value at entry t+1.
if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
base_data[ threadIdx.y + 1 ] = value ;
// Scan over the entries starting at base_data + 1.
Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
if ( global_accum ) {
// Only the highest-rank thread of the team touches the global accumulator.
if ( blockDim.y == threadIdx.y + 1 ) {
// atomic_fetch_add presumably returns the accumulator's previous value,
// which then replaces the entry that was added -- TODO confirm.
base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
}
__syncthreads(); // Wait for atomic
base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
}
return base_data[ threadIdx.y ];
#else
return Type();
#endif
}
/** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
 *
 *  Forwards to the two-argument overload with a null accumulator,
 *  so no inter-team accumulation takes place.
 *
 *  The highest rank thread can compute the reduction total as
 *    reduction_total = dev.team_scan( value ) + value ;
 */
template< typename Type >
KOKKOS_INLINE_FUNCTION
Type team_scan( const Type & value ) const
  {
    Type * const no_global_accum = nullptr ;
    return this->template team_scan<Type>( value , no_global_accum );
  }
//----------------------------------------
/** \brief Reduction across the vector lanes (threadIdx.x) of the calling thread.
*
* The contribution is read from reducer.reference() and the value held
* by lane 0 after the reduction is broadcast back into reducer.reference().
* Returns immediately when blockDim.x == 1.
* Does nothing when compiled for the host.
*
* The halving loop below starts at blockDim.x; the parallel_scan comment in
* this file states that blockDim.x is required to be a power of two.
*/
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer )
{
#ifdef __CUDA_ARCH__
if(blockDim.x == 1) return;
// Intra vector lane shuffle reduction:
// 'tmp' accumulates; the shuffle distance i is blockDim.x/2, ... , 1.
typename ReducerType::value_type tmp ( reducer.reference() );
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
// Cannot use "butterfly" algorithm to avoid the broadcast
// because floating point summation is not associative
// and thus different threads could have different results.
cuda_shfl( reducer.reference() , tmp , 0 , blockDim.x );
#endif
}
//--------------------------------------------------------------------------
/**\brief Global reduction across all blocks
*
* Return !0 if reducer contains the final value
*
* \param reducer supplies the contribution and join/copy operations
* \param global_scratch_flags counter incremented atomically; reset to 0
* by the block that observes the final count
* \param global_scratch_space storage indexed by blockIdx.x * reducer.length()
* \param shmem workspace for the intra-block reduction
* \param shmem_size size of 'shmem' in bytes
* (divided by sizeof(value_type) below)
*
* Always returns 0 when compiled for the host.
*
* NOTE(review): join() is called here with pointer arguments
* ( & tmp , reducer.data() ) whereas team_reduce() passes references;
* confirm the reducer interface supports both forms.
*/
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value , int >::type
global_reduce( ReducerType const & reducer
, int * const global_scratch_flags
, void * const global_scratch_space
, void * const shmem
, int const shmem_size
)
{
#ifdef __CUDA_ARCH__
typedef typename ReducerType::value_type value_type ;
typedef value_type volatile * pointer_type ;
// Number of shared memory entries for the reduction:
const int nsh = shmem_size / sizeof(value_type);
// Number of CUDA threads in the block, rank within the block
const int nid = blockDim.x * blockDim.y * blockDim.z ;
const int tid = threadIdx.x + blockDim.x * (
threadIdx.y + blockDim.y * threadIdx.z );
// Reduces within block using all available shared memory
// Contributes if it is the root "vector lane"
// wn == number of warps in the block
// wx == which lane within the warp
// wy == which warp within the block
const int wn = ( nid + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
const int wx = tid & CudaTraits::WarpIndexMask ;
const int wy = tid >> CudaTraits::WarpIndexShift ;
//------------------------
{ // Intra warp shuffle reduction from contributing CUDA threads
value_type tmp( reducer.reference() );
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
// Root of each vector lane reduces "thread" contribution
if ( 0 == threadIdx.x && wx < i ) {
reducer.join( & tmp , reducer.data() );
}
}
// Reduce across warps using shared memory.
// Number of warps may not be power of two.
__syncthreads(); // Wait before shared data write
// Number of shared memory entries for the reduction
// is at most one per warp
const int nentry = wn < nsh ? wn : nsh ;
if ( 0 == wx && wy < nentry ) {
// Root thread of warp 'wy' has warp's value to contribute
((value_type*) shmem)[wy] = tmp ;
}
__syncthreads(); // Wait for write to be visible to block
// When more warps than shared entries
// then warps must take turns joining their contribution
// to the designated shared memory entry.
for ( int i = nentry ; i < wn ; i += nentry ) {
const int k = wy - i ;
if ( 0 == wx && i <= wy && k < nentry ) {
// Root thread of warp 'wy' has warp's value to contribute
reducer.join( ((value_type*) shmem) + k , & tmp );
}
__syncthreads(); // Wait for write to be visible to block
}
// One warp performs the inter-warp reduction:
if ( 0 == wy ) {
// Start fan-in at power of two covering nentry
for ( int i = ( 1 << ( 32 - __clz(nentry-1) ) ) ; ( i >>= 1 ) ; ) {
const int k = wx + i ;
if ( wx < i && k < nentry ) {
reducer.join( ((pointer_type)shmem) + wx
, ((pointer_type)shmem) + k );
__threadfence_block(); // Wait for write to be visible to warp
}
}
}
}
//------------------------
{ // Write block's value to global_scratch_memory
int last_block = 0 ;
// NOTE(review): '0 == wx' holds for one thread in every warp of the block,
// so the copy and the atomic increment below run once per warp, while the
// comparison against gridDim.x reads as one increment per block -- confirm.
// NOTE(review): the value copied is reducer.data(), not the entry of
// 'shmem' produced by the inter-warp reduction above -- confirm.
if ( 0 == wx ) {
reducer.copy( ((pointer_type)global_scratch_space)
+ blockIdx.x * reducer.length()
, reducer.data() );
__threadfence(); // Wait until global write is visible.
last_block = gridDim.x ==
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
// If last block then reset count
if ( last_block ) *global_scratch_flags = 0 ;
}
// Every thread of the block learns whether any thread saw the final count.
last_block = __syncthreads_or( last_block );
if ( ! last_block ) return 0 ;
}
//------------------------
// Last block reads global_scratch_memory into shared memory.
const int nentry = nid < gridDim.x ?
( nid < nsh ? nid : nsh ) :
( gridDim.x < nsh ? gridDim.x : nsh ) ;
// nentry = min( nid , nsh , gridDim.x )
// whole block reads global memory into shared memory:
if ( tid < nentry ) {
const int offset = tid * reducer.length();
reducer.copy( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space) + offset );
// Remaining per-block values are joined in with stride nentry.
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
reducer.join( ((pointer_type)shmem) + offset
, ((pointer_type)global_scratch_space)
+ i * reducer.length() );
}
}
__syncthreads(); // Wait for writes to be visible to block
if ( 0 == wy ) {
// Iterate to reduce shared memory to single warp fan-in size
const int nreduce = CudaTraits::WarpSize < nentry
? CudaTraits::WarpSize : nentry ;
// nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )
if ( wx < nreduce && nreduce < nentry ) {
for ( int i = nreduce + wx ; i < nentry ; i += nreduce ) {
reducer.join( ((pointer_type)shmem) + wx
, ((pointer_type)shmem) + i );
}
__threadfence_block(); // Wait for writes to be visible to warp
}
// Start fan-in at power of two covering nentry
for ( int i = ( 1 << ( 32 - __clz(nreduce-1) ) ) ; ( i >>= 1 ) ; ) {
const int k = wx + i ;
if ( wx < i && k < nreduce ) {
reducer.join( ((pointer_type)shmem) + wx
, ((pointer_type)shmem) + k );
__threadfence_block(); // Wait for writes to be visible to warp
}
}
// Only lane 0 of warp 0 receives the final value and reports success.
if ( 0 == wx ) {
reducer.copy( reducer.data() , (pointer_type)shmem );
return 1 ;
}
}
return 0 ;
#else
return 0 ;
#endif
}
//----------------------------------------
// Private for the driver.
// The first 'shared_begin' bytes of 'shared' serve as the workspace of the
// team collectives; the team scratch handle starts right behind them.
KOKKOS_INLINE_FUNCTION
CudaTeamMember( void * shared
              , const int shared_begin
              , const int shared_size
              , void * scratch_level_1_ptr
              , const int scratch_level_1_size
              , const int arg_league_rank
              , const int arg_league_size )
  : m_team_reduce( shared )
  , m_team_shared( static_cast<char *>( shared ) + shared_begin
                 , shared_size
                 , scratch_level_1_ptr
                 , scratch_level_1_size )
  , m_team_reduce_size( shared_begin )
  , m_league_rank( arg_league_rank )
  , m_league_size( arg_league_size )
  {}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief  Index range [start,end) plus the team member it is distributed over. */
template< typename iType >
struct TeamThreadRangeBoundariesStruct< iType , CudaTeamMember > {
  using index_type = iType ;

  const CudaTeamMember & member ;
  const iType            start ;
  const iType            end ;

  // Range [0,count)
  KOKKOS_INLINE_FUNCTION
  TeamThreadRangeBoundariesStruct( const CudaTeamMember & team , const iType & count )
    : member( team ) , start( 0 ) , end( count )
    {}

  // Range [first,last)
  KOKKOS_INLINE_FUNCTION
  TeamThreadRangeBoundariesStruct( const CudaTeamMember & team , const iType & first , const iType & last )
    : member( team ) , start( first ) , end( last )
    {}
};
/** \brief  Index range [0,count) distributed over the vector lanes of a thread.
 *
 *  Only the bounds are stored; the team member argument of the first
 *  constructor is accepted for interface symmetry and is not retained.
 */
template< typename iType >
struct ThreadVectorRangeBoundariesStruct< iType , CudaTeamMember > {
  typedef iType index_type ;
  const iType start ;
  const iType end ;

  // The team member is taken by const reference: it is unused here,
  // so copying the whole object by value was needless work.
  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct( const CudaTeamMember & , const iType & count )
    : start( 0 ), end( count ) {}

  KOKKOS_INLINE_FUNCTION
  ThreadVectorRangeBoundariesStruct( const iType & count )
    : start( 0 ), end( count ) {}
};
} // namespace Impl
/** \brief  Build the team-thread range [0,count) for the given team member. */
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count )
{
  typedef Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember > range_type ;
  return range_type( thread , count );
}
/** \brief  Build the team-thread range [begin,end) for the given team member.
 *
 *  Both bounds are converted to their std::common_type.
 */
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
                                       Impl::CudaTeamMember >
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end )
{
  typedef typename std::common_type< iType1, iType2 >::type index_type ;
  typedef Impl::TeamThreadRangeBoundariesStruct< index_type, Impl::CudaTeamMember > range_type ;
  const index_type first = index_type( begin );
  const index_type last  = index_type( end );
  return range_type( thread , first , last );
}
/** \brief  Build the thread-vector range [0,count) for the given team member. */
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
ThreadVectorRange( const Impl::CudaTeamMember & thread , const iType & count )
{
  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember > range_type ;
  return range_type( thread , count );
}
/** \brief  Tag object selecting the once-per-team overloads of single(). */
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam( const Impl::CudaTeamMember & thread )
{
  typedef Impl::ThreadSingleStruct<Impl::CudaTeamMember> single_type ;
  return single_type( thread );
}
/** \brief  Tag object selecting the once-per-thread overloads of single(). */
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread( const Impl::CudaTeamMember & thread )
{
  typedef Impl::VectorSingleStruct<Impl::CudaTeamMember> single_type ;
  return single_type( thread );
}
//----------------------------------------------------------------------------
/** \brief  Inter-thread parallel_for.
 *
 *  Executes closure(iType i) for each i in [start,end) of loop_boundaries.
 *
 *  The range is mapped to all threads of the calling thread team:
 *  each thread begins at start + threadIdx.y and advances by blockDim.y.
 *  Does nothing when compiled for the host.
 */
template< typename iType , class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
      loop_boundaries
  , const Closure & closure
  )
{
#if defined( __CUDA_ARCH__ )
  iType idx = loop_boundaries.start + threadIdx.y ;
  while ( idx < loop_boundaries.end ) {
    closure( idx );
    idx += blockDim.y ;
  }
#endif
}
//----------------------------------------------------------------------------
/** \brief  Inter-thread parallel_reduce with a reducer.
 *
 *  Executes closure(iType i, ValueType & val) for each i in [start,end)
 *  of loop_boundaries, with val bound to reducer.reference().
 *
 *  The range is mapped to all threads of the calling thread team and
 *  the per-thread values are combined by the team's team_reduce( reducer ).
 *  Does nothing when compiled for the host.
 */
template< typename iType, class Closure, class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
      loop_boundaries
  , const Closure & closure
  , const ReducerType & reducer
  )
{
#if defined( __CUDA_ARCH__ )
  // Reset this thread's value before accumulating.
  reducer.init( reducer.reference() );

  iType idx = loop_boundaries.start + threadIdx.y ;
  while ( idx < loop_boundaries.end ) {
    closure( idx , reducer.reference() );
    idx += blockDim.y ;
  }

  loop_boundaries.member.team_reduce( reducer );
#endif
}
/** \brief  Inter-thread parallel_reduce assuming summation.
 *
 *  Executes closure(iType i, ValueType & val) for each i in [start,end)
 *  of loop_boundaries, with val bound to 'result'.
 *
 *  The range is mapped to all threads of the calling thread team and
 *  the per-thread values are combined through a
 *  Kokkos::Experimental::Sum<ValueType> reducer wrapping 'result'.
 *  Does nothing when compiled for the host.
 */
template< typename iType, class Closure, typename ValueType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
      loop_boundaries
  , const Closure & closure
  , ValueType & result
  )
{
#if defined( __CUDA_ARCH__ )
  Kokkos::Experimental::Sum<ValueType> reducer( result );

  // Reset the value before accumulating.
  reducer.init( reducer.reference() );

  iType idx = loop_boundaries.start + threadIdx.y ;
  while ( idx < loop_boundaries.end ) {
    closure( idx , result );
    idx += blockDim.y ;
  }

  loop_boundaries.member.team_reduce( reducer );
#endif
}
//----------------------------------------------------------------------------
/** \brief  Intra-thread vector parallel_for.
 *
 *  Executes closure(iType i) for each i in [start,end) of loop_boundaries.
 *
 *  The range is mapped to all vector lanes of the calling thread:
 *  each lane begins at start + threadIdx.x and advances by blockDim.x.
 *  Does nothing when compiled for the host.
 */
template< typename iType , class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
      loop_boundaries
  , const Closure & closure
  )
{
#if defined( __CUDA_ARCH__ )
  iType idx = loop_boundaries.start + threadIdx.x ;
  while ( idx < loop_boundaries.end ) {
    closure( idx );
    idx += blockDim.x ;
  }
#endif
}
//----------------------------------------------------------------------------
/** \brief  Intra-thread vector parallel_reduce with a reducer.
 *
 *  Calls closure(iType i, ValueType & val) for each i in [start,end)
 *  of loop_boundaries, with val bound to reducer.reference().
 *
 *  The range is mapped to all vector lanes of the calling thread and
 *  the per-lane values are combined by CudaTeamMember::vector_reduce.
 *  The starting value is whatever reducer.init() assigns.
 *  Does nothing when compiled for the host.
 */
template< typename iType, class Closure, class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
parallel_reduce
  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
      const & loop_boundaries
  , Closure const & closure
  , ReducerType const & reducer )
{
#if defined( __CUDA_ARCH__ )
  reducer.init( reducer.reference() );

  iType idx = loop_boundaries.start + threadIdx.x ;
  while ( idx < loop_boundaries.end ) {
    closure( idx , reducer.reference() );
    idx += blockDim.x ;
  }

  Impl::CudaTeamMember::vector_reduce( reducer );
#endif
}
/** \brief  Intra-thread vector parallel_reduce assuming summation.
 *
 *  Calls closure(iType i, ValueType & val) for each i in [start,end)
 *  of loop_boundaries, with val bound to 'result'.
 *
 *  The range is mapped to all vector lanes of the calling thread and
 *  the per-lane values are combined through a
 *  Kokkos::Experimental::Sum<ValueType> reducer wrapping 'result'.
 *
 *  The identity value is assumed to be the default constructed value:
 *  'result' is reset to ValueType() before accumulation.
 *  Does nothing when compiled for the host.
 */
template< typename iType, class Closure, typename ValueType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! is_reducer< ValueType >::value >::type
parallel_reduce
  ( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
      const & loop_boundaries
  , Closure const & closure
  , ValueType & result )
{
#if defined( __CUDA_ARCH__ )
  result = ValueType();

  iType idx = loop_boundaries.start + threadIdx.x ;
  while ( idx < loop_boundaries.end ) {
    closure( idx , result );
    idx += blockDim.x ;
  }

  const Kokkos::Experimental::Sum<ValueType> sum_reducer( result );
  Impl::CudaTeamMember::vector_reduce( sum_reducer );
#endif
}
//----------------------------------------------------------------------------
/** \brief Intra-thread vector parallel exclusive prefix sum.
*
* Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
*
* The range [0..N) is mapped to all vector lanes in the
* thread and a scan operation is performed.
* The last call to closure has final == true.
*
* For every index the closure is called twice: first with final == false
* and a zero-initialized value to collect the contribution, then with
* final == true and the exclusive scan value.
* Does nothing when compiled for the host.
*
* NOTE(review): iteration begins at threadIdx.x and never reads
* loop_boundaries.start; both constructors of the CUDA
* ThreadVectorRangeBoundariesStruct in this file set start to 0.
* NOTE(review): indices are held in 'int' although the range type is iType.
*/
template< typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_scan
( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries
, const Closure & closure
)
{
#ifdef __CUDA_ARCH__
// Extract value_type from closure
using value_type =
typename Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::SCAN
, void
, Closure >::value_type ;
// Loop through boundaries by vector-length chunks
// must scan at each iteration
// 'accum' carries the running total of all previous chunks.
value_type accum = 0 ;
// All thread "lanes" must loop the same number of times.
// Determine an loop end for all thread "lanes."
// Requires:
// blockDim.x is power of two and thus
// ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
// 1 <= blockDim.x <= CudaTraits::WarpSize
const int mask = blockDim.x - 1 ;
const int rem = loop_boundaries.end & mask ; // == end % blockDim.x
// 'end' is loop_boundaries.end rounded up to a multiple of blockDim.x.
const int end = loop_boundaries.end + ( rem ? blockDim.x - rem : 0 );
for ( int i = threadIdx.x ; i < end ; i += blockDim.x ) {
value_type val = 0 ;
// First acquire per-lane contributions:
// (lanes past the real end keep val == 0)
if ( i < loop_boundaries.end ) closure( i , val , false );
value_type sval = val ;
// Bottom up inclusive scan in triangular pattern
// where each CUDA thread is the root of a reduction tree
// from the zeroth "lane" to itself.
// [t] += [t-1] if t >= 1
// [t] += [t-2] if t >= 2
// [t] += [t-4] if t >= 4
// ...
for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
value_type tmp = 0 ;
Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
if ( j <= threadIdx.x ) { sval += tmp ; }
}
// Include accumulation and remove value for exclusive scan:
val = accum + sval - val ;
// Provide exclusive scan value:
if ( i < loop_boundaries.end ) closure( i , val , true );
// Accumulate the last value in the inclusive scan:
// (source lane index 'mask' == blockDim.x - 1)
Impl::cuda_shfl( sval , sval , mask , blockDim.x );
accum += sval ;
}
#endif
}
}
namespace Kokkos {
/** \brief  Execute lambda() on vector lane 0 (threadIdx.x == 0) of the calling thread only.
 *          Does nothing when compiled for the host.
 */
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void single( const Impl::VectorSingleStruct<Impl::CudaTeamMember> & , const FunctorType & lambda )
{
#if defined( __CUDA_ARCH__ )
  if ( 0 == threadIdx.x ) {
    lambda();
  }
#endif
}
/** \brief  Execute lambda() only where threadIdx.x == 0 and threadIdx.y == 0.
 *          Does nothing when compiled for the host.
 */
template< class FunctorType >
KOKKOS_INLINE_FUNCTION
void single( const Impl::ThreadSingleStruct<Impl::CudaTeamMember> & , const FunctorType & lambda )
{
#if defined( __CUDA_ARCH__ )
  const bool is_root = ( threadIdx.x == 0 && threadIdx.y == 0 );
  if ( is_root ) {
    lambda();
  }
#endif
}
/** \brief  Execute lambda(val) on vector lane 0 of the calling thread, then
 *          assign every lane the value shuffled from lane 0 (width blockDim.x).
 *          Does nothing when compiled for the host.
 */
template< class FunctorType , class ValueType >
KOKKOS_INLINE_FUNCTION
void single( const Impl::VectorSingleStruct<Impl::CudaTeamMember> & , const FunctorType & lambda , ValueType & val )
{
#if defined( __CUDA_ARCH__ )
  if ( 0 == threadIdx.x ) {
    lambda( val );
  }
  val = shfl( val , 0 , blockDim.x );
#endif
}
/** \brief  Execute lambda(val) where threadIdx.x == 0 and threadIdx.y == 0, then
 *          broadcast val from team thread 0 through the team member's team_broadcast.
 *          Does nothing when compiled for the host.
 */
template< class FunctorType , class ValueType >
KOKKOS_INLINE_FUNCTION
void single( const Impl::ThreadSingleStruct<Impl::CudaTeamMember> & single_struct , const FunctorType & lambda , ValueType & val )
{
#if defined( __CUDA_ARCH__ )
  const bool is_root = ( threadIdx.x == 0 && threadIdx.y == 0 );
  if ( is_root ) {
    lambda( val );
  }
  single_struct.team_member.team_broadcast( val , 0 );
#endif
}
} // namespace Kokkos
#endif /* defined( __CUDACC__ ) */
#endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,8 +44,6 @@
#define KOKKOS_CUDA_VECTORIZATION_HPP
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_ENABLE_CUDA
#include <Kokkos_Cuda.hpp>
@ -296,3 +294,4 @@ namespace Impl {
#endif // KOKKOS_ENABLE_CUDA
#endif

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,7 +44,7 @@
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
/* only compile this file if CUDA is enabled for Kokkos */
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_CUDA )
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -46,7 +46,7 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include "Kokkos_Macros.hpp"
#include <Kokkos_Macros.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include <cuda.h>
@ -82,6 +82,8 @@ void cuda_abort( const char * const message )
} // namespace Impl
} // namespace Kokkos
#else
// Empty placeholder compiled when the CUDA branch above is disabled.
// Declared 'inline' because this definition sits inside an include-guarded
// header: without it every translation unit including the header would emit
// its own external definition and the link would fail with duplicates.
inline void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {}
#endif /* #if defined(__CUDACC__) && defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */

View File

@ -236,13 +236,13 @@ struct MDRangePolicy
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
#else
if(m_lower.size()!=rank || m_upper.size() != rank)
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
for ( auto i = 0; i < rank; ++i ) {
m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
if(tile.size()==rank)
if(static_cast<int>(tile.size())==rank)
m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
else
m_tile[i] = 0;

View File

@ -44,6 +44,7 @@
#define KOKKOS_COMPLEX_HPP
#include <Kokkos_Atomic.hpp>
#include <Kokkos_NumericTraits.hpp>
#include <cmath>
#include <complex>
#include <iostream>
@ -324,9 +325,30 @@ public:
im_ /= src;
return *this;
}
//! Equality with another complex number (both parts must match).
//! Declared const so that const objects can be compared as well.
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& src) const {
  return (re_ == src.re_) && (im_ == src.im_);
}
//! Equality with a real number: true when the real part equals src
//! and the imaginary part is zero.
//! Declared const so that const objects can be compared as well.
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType src) const {
  return (re_ == src) && (im_ == RealType(0));
}
//! Inequality with another complex number (either part differs).
//! Declared const so that const objects can be compared as well.
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& src) const {
  return (re_ != src.re_) || (im_ != src.im_);
}
//! Inequality with a real number: true when the real part differs from src
//! or the imaginary part is non-zero.
//! Declared const so that const objects can be compared as well.
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType src) const {
  return (re_ != src) || (im_ != RealType(0));
}
};
//! Binary + operator for complex.
//! Binary + operator for complex complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
@ -334,6 +356,22 @@ operator + (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
}
//! Binary + operator: complex plus real scalar (the scalar is added to the real part).
template< class RealType >
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + ( const complex<RealType> & x , const RealType & y )
{
  typedef complex<RealType> result_type ;
  return result_type( x.real() + y , x.imag() );
}
//! Binary + operator: real scalar plus complex (the scalar is added to the real part).
template< class RealType >
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + ( const RealType & x , const complex<RealType> & y )
{
  typedef complex<RealType> result_type ;
  return result_type( x + y.real() , y.imag() );
}
//! Unary + operator for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
@ -350,6 +388,22 @@ operator - (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
}
//! Binary - operator: complex minus real scalar (the scalar is subtracted from the real part).
template< class RealType >
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - ( const complex<RealType> & x , const RealType & y )
{
  typedef complex<RealType> result_type ;
  return result_type( x.real() - y , x.imag() );
}
//! Binary - operator: real scalar minus complex (the imaginary part is negated).
template< class RealType >
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - ( const RealType & x , const complex<RealType> & y )
{
  typedef complex<RealType> result_type ;
  return result_type( x - y.real() , - y.imag() );
}
//! Unary - operator for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
@ -395,6 +449,16 @@ operator * (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x * y.real (), x * y.imag ());
}
/// \brief Binary * operator for complex times RealType.
///
/// This function exists because the compiler doesn't know that
/// RealType and complex<RealType> commute with respect to operator*.
/// Both parts of the complex operand are scaled by the real operand.
template< class RealType >
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * ( const complex<RealType> & y , const RealType & x )
{
  typedef complex<RealType> result_type ;
  return result_type( x * y.real() , x * y.imag() );
}
//! Imaginary part of a complex number.
template<class RealType>
@ -415,7 +479,25 @@ template<class RealType>
KOKKOS_INLINE_FUNCTION
RealType abs (const complex<RealType>& x) {
// FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
return ::sqrt (real (x) * real (x) + imag (x) * imag (x));
return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
}
//! Power of a complex number: x raised to the real exponent e,
//! evaluated in polar form as |x|^e * ( cos(e*phi) + i sin(e*phi) ).
template<class RealType>
KOKKOS_INLINE_FUNCTION
Kokkos::complex<RealType> pow (const complex<RealType>& x, const RealType& e) {
  const RealType r = abs(x);
  // atan2 keeps the quadrant of x and is defined for x.real() == 0;
  // atan(imag/real) mapped the left half plane onto the right one
  // and divided by zero for purely imaginary arguments.
  const RealType phi = std::atan2(x.imag(), x.real());
  return std::pow(r,e) * Kokkos::complex<RealType>(std::cos(phi*e),std::sin(phi*e));
}
//! Square root of a complex number,
//! evaluated in polar form as sqrt(|x|) * ( cos(phi/2) + i sin(phi/2) ).
template<class RealType>
KOKKOS_INLINE_FUNCTION
Kokkos::complex<RealType> sqrt (const complex<RealType>& x) {
  const RealType r = abs(x);
  // atan2 keeps the quadrant of x and is defined for x.real() == 0;
  // with atan(imag/real) the square root of -1 evaluated to 1 instead of i.
  const RealType phi = std::atan2(x.imag(), x.real());
  return std::sqrt(r) * Kokkos::complex<RealType>(std::cos(phi*0.5),std::sin(phi*0.5));
}
//! Conjugate of a complex number.
@ -425,6 +507,19 @@ complex<RealType> conj (const complex<RealType>& x) {
return complex<RealType> (real (x), -imag (x));
}
//! Exponential of a complex number: e^(re) * ( cos(im) + i sin(im) ).
template< class RealType >
KOKKOS_INLINE_FUNCTION
complex<RealType> exp ( const complex<RealType> & x )
{
  // Unit-modulus factor determined by the imaginary part.
  const complex<RealType> rotation( std::cos( x.imag() ) , std::sin( x.imag() ) );
  return std::exp( x.real() ) * rotation ;
}
//! Exponential of a complex number.
//!
//! NOTE(review): despite the name 'pow', this one-argument overload has the
//! same body as exp() and returns e^x; it looks like a copy of exp() --
//! confirm whether it is intended before relying on it.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType> pow (const complex<RealType>& x) {
return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()), std::sin(x.imag()));
}
//! Binary operator / for complex and real numbers
template<class RealType1, class RealType2>
@ -461,6 +556,14 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
}
}
//! Binary operator / for a real number divided by a complex number.
//! The real operand is first converted to complex<RealType1>.
template< class RealType1 , class RealType2 >
KOKKOS_INLINE_FUNCTION
complex<RealType1>
operator / ( const RealType1 & x , const complex<RealType2> & y )
{
  const complex<RealType1> numerator( x );
  return numerator / y ;
}
//! Equality operator for two complex numbers.
template<class RealType>
KOKKOS_INLINE_FUNCTION
@ -468,9 +571,13 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
return real (x) == real (y) && imag (x) == imag (y);
}
//! Equality operator for std::complex and Kokkos::complex.
/// \brief Equality operator for std::complex and Kokkos::complex.
///
/// This cannot be a device function, since std::real is not.
/// Otherwise, CUDA builds will give compiler warnings ("warning:
/// calling a constexpr __host__ function("real") from a __host__
/// __device__ function("operator==") is not allowed").
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
return std::real (x) == real (y) && std::imag (x) == imag (y);
}
@ -533,6 +640,15 @@ std::ostream& operator >> (std::ostream& os, complex<RealType>& x) {
}
/// \brief Reduction identities for Kokkos::complex<T>, built from those of T.
///
/// sum()  is ( sum-identity  of T , sum-identity of T );
/// prod() is ( prod-identity of T , sum-identity of T ).
template< class T >
struct reduction_identity< Kokkos::complex<T> > {
  using t_red_ident = reduction_identity<T> ;

  KOKKOS_FORCEINLINE_FUNCTION constexpr static
  Kokkos::complex<T> sum()
    { return Kokkos::complex<T>( t_red_ident::sum() , t_red_ident::sum() ); }

  KOKKOS_FORCEINLINE_FUNCTION constexpr static
  Kokkos::complex<T> prod()
    { return Kokkos::complex<T>( t_red_ident::prod() , t_red_ident::sum() ); }
};
} // namespace Kokkos
#endif // KOKKOS_COMPLEX_HPP

View File

@ -200,7 +200,14 @@ public:
, Kokkos::DefaultHostExecutionSpace , execution_space
>::type host_execution_space ;
#else
typedef execution_space host_execution_space ;
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
typedef typename std::conditional
< std::is_same< execution_space , Kokkos::Experimental::OpenMPTarget >::value
, Kokkos::DefaultHostExecutionSpace , execution_space
>::type host_execution_space ;
#else
typedef execution_space host_execution_space ;
#endif
#endif
typedef typename std::conditional

View File

@ -57,11 +57,16 @@
#include <Kokkos_OpenMP.hpp>
#endif
//#if defined( KOKKOS_ENABLE_OPENMPTARGET )
#include <Kokkos_OpenMPTarget.hpp>
#include <Kokkos_OpenMPTargetSpace.hpp>
//#endif
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <Kokkos_Qthreads.hpp>
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
#if defined( KOKKOS_ENABLE_THREADS )
#include <Kokkos_Threads.hpp>
#endif
@ -69,8 +74,8 @@
#include <Kokkos_Cuda.hpp>
#endif
#include <Kokkos_MemoryPool.hpp>
#include <Kokkos_Pair.hpp>
#include <Kokkos_MemoryPool.hpp>
#include <Kokkos_Array.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Vectorization.hpp>
@ -167,3 +172,4 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
//----------------------------------------------------------------------------
#endif

View File

@ -96,7 +96,7 @@ class Serial; ///< Execution space main process on CPU.
class Qthreads; ///< Execution space with Qthreads back-end.
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
#if defined( KOKKOS_ENABLE_THREADS )
class Threads; ///< Execution space with pthreads back-end.
#endif
@ -104,6 +104,14 @@ class Threads; ///< Execution space with pthreads back-end.
class OpenMP; ///< OpenMP execution space.
#endif
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
// Forward declarations for the experimental OpenMPTarget back-end; they are
// only visible when KOKKOS_ENABLE_OPENMPTARGET is defined.
namespace Experimental {
class OpenMPTarget; ///< OpenMPTarget execution space.
class OpenMPTargetSpace; ///< NOTE(review): presumably the memory space paired with OpenMPTarget — confirm.
}
#endif
#if defined( KOKKOS_ENABLE_CUDA )
class CudaSpace; ///< Memory space on Cuda GPU
class CudaUVMSpace; ///< Memory space on Cuda GPU with UVM
@ -121,12 +129,14 @@ struct Device;
/// Define Kokkos::DefaultExecutionSpace as per configuration option
/// or chosen from the enabled execution spaces in the following order:
/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
/// Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
namespace Kokkos {
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Cuda DefaultExecutionSpace;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@ -136,7 +146,7 @@ namespace Kokkos {
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultExecutionSpace;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
#endif
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
@ -149,7 +159,7 @@ namespace Kokkos {
typedef Serial DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_PTHREAD )
#elif defined( KOKKOS_ENABLE_THREADS )
typedef Threads DefaultHostExecutionSpace;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Qthreads DefaultHostExecutionSpace;
@ -254,6 +264,21 @@ template< class FunctorType, class ExecPolicy, class ExecutionSapce =
} // namespace Impl
// Forward declarations of class templates in Kokkos::Experimental.
// Each takes a ScalarType (the *Loc variants additionally take an Index type)
// and a Space parameter that defaults to HostSpace.
// NOTE(review): presumably these are the built-in reducers usable with
// parallel_reduce — confirm against their definitions.
namespace Experimental {
template<class ScalarType , class Space = HostSpace> struct Sum;
template<class ScalarType , class Space = HostSpace> struct Prod;
template<class ScalarType , class Space = HostSpace> struct Min;
template<class ScalarType , class Space = HostSpace> struct Max;
template<class ScalarType , class Space = HostSpace> struct MinMax;
template<class ScalarType , class Index, class Space = HostSpace> struct MinLoc;
template<class ScalarType , class Index, class Space = HostSpace> struct MaxLoc;
template<class ScalarType , class Index, class Space = HostSpace> struct MinMaxLoc;
template<class ScalarType , class Space = HostSpace> struct BAnd;
template<class ScalarType , class Space = HostSpace> struct BOr;
template<class ScalarType , class Space = HostSpace> struct LAnd;
template<class ScalarType , class Space = HostSpace> struct LOr;
}
} // namespace Kokkos
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */

View File

@ -44,12 +44,11 @@
#ifndef KOKKOS_CUDA_HPP
#define KOKKOS_CUDA_HPP
#include <Kokkos_Core_fwd.hpp>
// If CUDA execution space is enabled then use this header file.
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_CUDA )
#include <Kokkos_Core_fwd.hpp>
#include <iosfwd>
#include <vector>
@ -214,6 +213,8 @@ public:
//@}
//--------------------------------------------------------------------------
static const char* name();
private:
cudaStream_t m_stream ;
@ -291,6 +292,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_CudaExec.hpp>
#include <Cuda/Kokkos_Cuda_View.hpp>
#include <Cuda/Kokkos_Cuda_Team.hpp>
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>
@ -300,5 +302,3 @@ struct VerifyExecutionCanAccessMemorySpace
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
#endif /* #ifndef KOKKOS_CUDA_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,10 +44,11 @@
#ifndef KOKKOS_CUDASPACE_HPP
#define KOKKOS_CUDASPACE_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_CUDA )
#include <Kokkos_Core_fwd.hpp>
#include <iosfwd>
#include <typeinfo>
#include <string>
@ -717,7 +718,7 @@ private:
static ::cudaTextureObject_t
attach_texture_object( const unsigned sizeof_alias
, void * const alloc_ptr
, const size_t alloc_size );
, const size_t alloc_size );
static RecordBase s_root_record ;

View File

@ -565,5 +565,3 @@ ThreadVectorRange( const TeamMemberType&, const iType& count );
#endif /* #define KOKKOS_EXECPOLICY_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -44,12 +44,11 @@
#ifndef KOKKOS_HBWSPACE_HPP
#define KOKKOS_HBWSPACE_HPP
#include <Kokkos_HostSpace.hpp>
/*--------------------------------------------------------------------------*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_HBWSPACE
#include <Kokkos_HostSpace.hpp>
namespace Kokkos {
namespace Experimental {
@ -114,7 +113,7 @@ public:
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_PTHREAD )
#elif defined( KOKKOS_ENABLE_THREADS )
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
@ -348,5 +347,5 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokk
} // namespace Kokkos
#endif
#endif // #define KOKKOS_HBWSPACE_HPP

View File

@ -117,7 +117,7 @@ public:
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_PTHREAD )
#elif defined( KOKKOS_ENABLE_THREADS )
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
@ -265,7 +265,7 @@ public:
return (SharedAllocationRecord *) 0;
#endif
}
/**\brief Allocate tracked memory in the space */
static
@ -316,3 +316,4 @@ struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
} // namespace Kokkos
#endif // #define KOKKOS_HOSTSPACE_HPP

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -47,7 +47,7 @@
#ifndef KOKKOS_LAYOUT_HPP
#define KOKKOS_LAYOUT_HPP
#include <stddef.h>
#include <cstddef>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
@ -62,7 +62,7 @@ enum { ARRAY_LAYOUT_MAX_RANK = 8 };
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Layout left" indicates a mapping where the leftmost index i0
/// refers to contiguous access, and strides increase for dimensions
@ -95,7 +95,7 @@ struct LayoutLeft {
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Right layout" indicates a mapping where the rightmost index ik
/// refers to contiguous access, and strides increase for dimensions
@ -130,7 +130,7 @@ struct LayoutStride {
typedef LayoutStride array_layout ;
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
LayoutStride( LayoutStride const & ) = default ;
LayoutStride( LayoutStride && ) = default ;
@ -192,7 +192,7 @@ struct LayoutStride {
///
/// This is an example of a \c MemoryLayout template parameter of
/// View. The memory layout describes how View maps from a
/// multi-index (i0, i1, ..., ik) to a memory location.
/// multi-index (i0, i1, ..., ik) to a memory location.
///
/// "Tiled layout" indicates a mapping to contiguously stored
/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two

View File

@ -48,9 +48,10 @@
/** Pick up configure / build options via #define macros:
*
* KOKKOS_ENABLE_CUDA Kokkos::Cuda execution and memory spaces
* KOKKOS_ENABLE_PTHREAD Kokkos::Threads execution space
* KOKKOS_ENABLE_THREADS Kokkos::Threads execution space
* KOKKOS_ENABLE_QTHREADS Kokkos::Qthreads execution space
* KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space
* KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget execution space
* KOKKOS_ENABLE_HWLOC HWLOC library is available.
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive!
* KOKKOS_ENABLE_MPI Negotiate MPI/execution space interactions.
@ -414,6 +415,7 @@
// There is zero or one default execution space specified.
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
@ -424,15 +426,18 @@
// If default is not specified then chose from enabled execution spaces.
// Priority: CUDA, OPENMPTARGET, OPENMP, THREADS, QTHREADS, SERIAL
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
#elif defined( KOKKOS_ENABLE_OPENMP )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined( KOKKOS_ENABLE_PTHREAD )
#elif defined( KOKKOS_ENABLE_THREADS )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
@ -459,10 +464,17 @@
#endif
//----------------------------------------------------------------------------
// Enable Profiling by default
// If compiling with CUDA then must be using CUDA 8 or better
// and use relocateable device code to enable the task policy.
// nvcc relocatable device code option: --relocatable-device-code=true
#ifndef KOKKOS_ENABLE_PROFILING
#define KOKKOS_ENABLE_PROFILING 1
#if ( defined( KOKKOS_ENABLE_CUDA ) )
#if ( 8000 <= CUDA_VERSION ) && defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_TASKDAG
#endif
#else
#define KOKKOS_ENABLE_TASKDAG
#endif
#endif // #ifndef KOKKOS_MACROS_HPP

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,217 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_NUMERICTRAITS_HPP
#define KOKKOS_NUMERICTRAITS_HPP
#include<climits>
#include<cfloat>
namespace Kokkos {
template<class T>
struct reduction_identity; /*{
KOKKOS_FORCEINLINE_FUNCTION constexpr static T sum() { return T(); } // 0
KOKKOS_FORCEINLINE_FUNCTION constexpr static T prod() // 1
{ static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom prod reduction type"); return T(); }
KOKKOS_FORCEINLINE_FUNCTION constexpr static T max() // minimum value
{ static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom max reduction type"); return T(); }
KOKKOS_FORCEINLINE_FUNCTION constexpr static T min() // maximum value
{ static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom min reduction type"); return T(); }
KOKKOS_FORCEINLINE_FUNCTION constexpr static T bor() // 0, only for integer type
{ static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom bor reduction type"); return T(); }
KOKKOS_FORCEINLINE_FUNCTION constexpr static T band() // !0, only for integer type
{ static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom band reduction type"); return T(); }
KOKKOS_FORCEINLINE_FUNCTION constexpr static T lor() // 0, only for integer type
{ static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom lor reduction type"); return T(); }
KOKKOS_FORCEINLINE_FUNCTION constexpr static T land() // !0, only for integer type
{ static_assert( false, "Missing specialization of Kokkos::reduction_identity for custom land reduction type"); return T(); }
};*/
/// \brief Identity values of the built-in reductions for \c signed \c char.
template<>
struct reduction_identity<signed char> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char sum()  { return 0; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char prod() { return 1; }
  /// Identity of max: the smallest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char max()  { return SCHAR_MIN; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char min()  { return SCHAR_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char bor()  { return 0; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char band() { return static_cast<signed char>(~0); }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char lor()  { return 0; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr signed char land() { return 1; }
};
/// \brief Identity values of the built-in reductions for \c short.
template<>
struct reduction_identity<short> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short sum()  { return 0; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short prod() { return 1; }
  /// Identity of max: the smallest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short max()  { return SHRT_MIN; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short min()  { return SHRT_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short bor()  { return 0; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short band() { return static_cast<short>(~0); }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short lor()  { return 0; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr short land() { return 1; }
};
/// \brief Identity values of the built-in reductions for \c int.
template<>
struct reduction_identity<int> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int sum()  { return 0; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int prod() { return 1; }
  /// Identity of max: the smallest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int max()  { return INT_MIN; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int min()  { return INT_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int bor()  { return 0; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int band() { return ~0; }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int lor()  { return 0; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr int land() { return 1; }
};
/// \brief Identity values of the built-in reductions for \c long.
///
/// The max/min identities use LONG_MIN / LONG_MAX (not the \c long \c long
/// limits) so that they are the true limits of \c long even on platforms
/// where \c long is narrower than \c long \c long.
template<>
struct reduction_identity<long> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long sum() {return static_cast<long>(0);}
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long prod() {return static_cast<long>(1);}
  /// Identity of max: the smallest representable \c long.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long max() {return LONG_MIN;}
  /// Identity of min: the largest representable \c long.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long min() {return LONG_MAX;}
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long bor() {return static_cast<long>(0x0);}
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long band() {return ~static_cast<long>(0x0);}
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long lor() {return static_cast<long>(0);}
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long land() {return static_cast<long>(1);}
};
/// \brief Identity values of the built-in reductions for \c long \c long.
template<>
struct reduction_identity<long long> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long sum()  { return 0LL; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long prod() { return 1LL; }
  /// Identity of max: the smallest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long max()  { return LLONG_MIN; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long min()  { return LLONG_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long bor()  { return 0LL; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long band() { return ~0LL; }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long lor()  { return 0LL; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr long long land() { return 1LL; }
};
/// \brief Identity values of the built-in reductions for \c unsigned \c char.
template<>
struct reduction_identity<unsigned char> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char sum()  { return 0; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char prod() { return 1; }
  /// Identity of max: zero, the smallest unsigned value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char max()  { return 0; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char min()  { return UCHAR_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char bor()  { return 0; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char band() { return static_cast<unsigned char>(~0); }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char lor()  { return 0; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned char land() { return 1; }
};
/// \brief Identity values of the built-in reductions for \c unsigned \c short.
template<>
struct reduction_identity<unsigned short> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short sum()  { return 0; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short prod() { return 1; }
  /// Identity of max: zero, the smallest unsigned value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short max()  { return 0; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short min()  { return USHRT_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short bor()  { return 0; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short band() { return static_cast<unsigned short>(~0); }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short lor()  { return 0; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned short land() { return 1; }
};
/// \brief Identity values of the built-in reductions for \c unsigned \c int.
template<>
struct reduction_identity<unsigned int> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int sum()  { return 0u; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int prod() { return 1u; }
  /// Identity of max: zero, the smallest unsigned value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int max()  { return 0u; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int min()  { return UINT_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int bor()  { return 0u; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int band() { return ~0u; }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int lor()  { return 0u; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned int land() { return 1u; }
};
/// \brief Identity values of the built-in reductions for \c unsigned \c long.
template<>
struct reduction_identity<unsigned long> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long sum()  { return 0ul; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long prod() { return 1ul; }
  /// Identity of max: zero, the smallest unsigned value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long max()  { return 0ul; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long min()  { return ULONG_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long bor()  { return 0ul; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long band() { return ~0ul; }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long lor()  { return 0ul; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long land() { return 1ul; }
};
/// \brief Identity values of the built-in reductions for
///        \c unsigned \c long \c long.
template<>
struct reduction_identity<unsigned long long> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long sum()  { return 0ull; }
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long prod() { return 1ull; }
  /// Identity of max: zero, the smallest unsigned value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long max()  { return 0ull; }
  /// Identity of min: the largest representable value.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long min()  { return ULLONG_MAX; }
  /// Identity of bitwise or: all bits clear.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long bor()  { return 0ull; }
  /// Identity of bitwise and: all bits set.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long band() { return ~0ull; }
  /// Identity of logical or: false.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long lor()  { return 0ull; }
  /// Identity of logical and: true.
  KOKKOS_FORCEINLINE_FUNCTION static constexpr unsigned long long land() { return 1ull; }
};
/// \brief Identity values of the built-in reductions for \c float.
///
/// Only sum, prod, max and min are provided; the bitwise and logical
/// identities are not defined for floating-point types.
template<>
struct reduction_identity<float> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast<float>(0.0f);}
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
  /// Identity of max: the most negative finite value.
  /// FLT_MIN is the smallest *positive* normalized value, so using it here
  /// would make a max-reduction over all-negative data return FLT_MIN.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return -FLT_MAX;}
  /// Identity of min: the largest finite value.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;}
};
/// \brief Identity values of the built-in reductions for \c double.
///
/// Only sum, prod, max and min are provided; the bitwise and logical
/// identities are not defined for floating-point types.
template<>
struct reduction_identity<double> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast<double>(0.0);}
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
  /// Identity of max: the most negative finite value.
  /// DBL_MIN is the smallest *positive* normalized value, so using it here
  /// would make a max-reduction over all-negative data return DBL_MIN.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return -DBL_MAX;}
  /// Identity of min: the largest finite value.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;}
};
/// \brief Identity values of the built-in reductions for \c long \c double.
///
/// Only sum, prod, max and min are provided; the bitwise and logical
/// identities are not defined for floating-point types.
template<>
struct reduction_identity<long double> {
  /// Identity of addition.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast<long double>(0.0);}
  /// Identity of multiplication.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
  /// Identity of max: the most negative finite value.
  /// LDBL_MIN is the smallest *positive* normalized value, so using it here
  /// would make a max-reduction over all-negative data return LDBL_MIN.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;}
  /// Identity of min: the largest finite value.
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;}
};
}
#endif

View File

@ -44,22 +44,23 @@
#ifndef KOKKOS_OPENMP_HPP
#define KOKKOS_OPENMP_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP)
#if defined( KOKKOS_ENABLE_OPENMP) && !defined(_OPENMP)
#if !defined(_OPENMP)
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
#endif
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP )
#include <omp.h>
#include <Kokkos_Core_fwd.hpp>
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#ifdef KOKKOS_ENABLE_HBWSPACE
#include <Kokkos_HBWSpace.hpp>
#endif
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskScheduler.hpp>
@ -79,26 +80,26 @@ public:
//@{
//! Tag this class as a kokkos execution space
typedef OpenMP execution_space ;
using execution_space = OpenMP;
#ifdef KOKKOS_ENABLE_HBWSPACE
typedef Experimental::HBWSpace memory_space ;
using memory_space = Experimental::HBWSpace;
#else
typedef HostSpace memory_space ;
using memory_space = HostSpace;
#endif
//! This execution space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
using device_type = Kokkos::Device<execution_space,memory_space>;
typedef LayoutRight array_layout ;
typedef memory_space::size_type size_type ;
using array_layout = LayoutRight;
using size_type = memory_space::size_type;
typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
using scratch_memory_space = ScratchMemorySpace< OpenMP >;
//@}
//------------------------------------
//! \name Functions that all Kokkos execution spaces must implement.
//@{
inline static bool in_parallel() { return omp_in_parallel(); }
inline static bool in_parallel();
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
static bool sleep();
@ -153,6 +154,8 @@ public:
KOKKOS_INLINE_FUNCTION static
unsigned hardware_thread_id() { return thread_pool_rank(); }
static const char* name();
};
} // namespace Kokkos
@ -164,7 +167,7 @@ namespace Kokkos {
namespace Impl {
template<>
struct MemorySpaceAccess
struct MemorySpaceAccess
< Kokkos::OpenMP::memory_space
, Kokkos::OpenMP::scratch_memory_space
>
@ -191,7 +194,7 @@ struct VerifyExecutionCanAccessMemorySpace
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
@ -201,4 +204,3 @@ struct VerifyExecutionCanAccessMemorySpace
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */
#endif /* #ifndef KOKKOS_OPENMP_HPP */

View File

@ -0,0 +1,186 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMPTARGET_HPP
#define KOKKOS_OPENMPTARGET_HPP
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP )
#include <omp.h>
#include <cstddef>
#include <iosfwd>
#include <Kokkos_OpenMPTargetSpace.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
/// \class OpenMPTarget
/// \brief Experimental Kokkos execution space paired with OpenMPTargetSpace.
///
/// NOTE(review): the previous brief described this as a device "for multicore
/// processors in the host memory space"; that wording looks inherited from the
/// host OpenMP back-end -- confirm what is intended for the target back-end.
class OpenMPTarget {
public:
  //------------------------------------
  //! \name Type declarations that all Kokkos devices must provide.
  //@{

  //! Tag this class as a Kokkos execution space.
  using execution_space = OpenMPTarget ;

  //! Memory space this execution space works on.
  using memory_space = OpenMPTargetSpace ;

  //! Preferred device_type: this execution space paired with its memory space.
  using device_type = Kokkos::Device< execution_space , memory_space > ;

  using array_layout         = LayoutLeft ;
  using size_type            = memory_space::size_type ;
  using scratch_memory_space = ScratchMemorySpace< OpenMPTarget > ;

  //@}
  //------------------------------------
  //! \name Functions that all Kokkos execution spaces must implement.
  //@{

  /** \brief True when omp_in_parallel() reports a non-zero value. */
  inline static bool in_parallel() { return omp_in_parallel() != 0 ; }

  /** \brief Put the device into a "sleep" state. A noop for OpenMPTarget. */
  static bool sleep();

  /** \brief Bring the device back from the "sleep" state. A noop for OpenMPTarget. */
  static bool wake();

  /** \brief Wait for all dispatched functors to complete; the body is empty. */
  static void fence() {}

  /// \brief Write configuration information to the given output stream.
  static void print_configuration( std::ostream & , const bool detail = false );

  /// \brief Release any resources held by the device.
  static void finalize();

  /** \brief Initialize the device.
   *
   *  1) If the hardware locality library is enabled and OpenMPTarget has not
   *     already bound threads then bind OpenMPTarget threads to maximize
   *     core utilization and group for memory hierarchy locality.
   *
   *  2) Allocate a HostThread for each OpenMPTarget thread to hold its
   *     topology and fan in/out data.
   *
   *  NOTE(review): only the declaration is visible here; the two steps above
   *  are carried over from the existing description and should be checked
   *  against the implementation.
   */
  static void initialize( unsigned thread_count       = 0 ,
                          unsigned use_numa_count     = 0 ,
                          unsigned use_cores_per_numa = 0 );

  static int is_initialized();

  /** \brief Return the maximum amount of concurrency. */
  static int concurrency();

  //@}
  //------------------------------------
  /** \brief This execution space has a topological thread pool which can be queried.
   *
   *  All threads within a pool have a common memory space for which they are
   *  cache coherent.
   *    depth = 0  number of threads in the whole pool.
   *    depth = 1  number of threads in a NUMA region, typically sharing L3 cache.
   *    depth = 2  number of threads at the finest granularity, typically sharing L1 cache.
   */
  inline static int thread_pool_size( int depth = 0 );

  /** \brief Rank of the executing thread within this thread pool. */
  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();

  //------------------------------------

  /** \brief thread_pool_size(0) converted to unsigned. */
  inline static unsigned max_hardware_threads()
    { return static_cast<unsigned>( thread_pool_size( 0 ) ); }

  /** \brief thread_pool_rank() converted to unsigned. */
  KOKKOS_INLINE_FUNCTION static
  unsigned hardware_thread_id()
    { return static_cast<unsigned>( thread_pool_rank() ); }

  static const char* name();

private:
  static bool m_is_initialized;
};
} // namespace Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
/// Access check between OpenMPTarget's memory space and its scratch memory
/// space: 'value' is true and both verify() overloads do nothing.
template<>
struct VerifyExecutionCanAccessMemorySpace
  < Kokkos::Experimental::OpenMPTarget::memory_space
  , Kokkos::Experimental::OpenMPTarget::scratch_memory_space
  >
{
  enum { value = true };

  //! Nothing to verify: access is permitted.
  inline static void verify() {}

  //! Pointer overload; the argument is ignored.
  inline static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp>
/*--------------------------------------------------------------------------*/
#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP ) */
#endif /* #ifndef KOKKOS_OPENMPTARGET_HPP */

View File

@ -0,0 +1,265 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMPTARGETSPACE_HPP
#define KOKKOS_OPENMPTARGETSPACE_HPP
#include <cstring>
#include <string>
#include <iosfwd>
#include <typeinfo>
#include <Kokkos_Core_fwd.hpp>
#ifdef KOKKOS_ENABLE_OPENMPTARGET
#include <Kokkos_HostSpace.hpp>
#include <omp.h>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
//void init_lock_array_host_space();
/// \brief Acquire a lock for the address
///
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
//bool lock_address_host_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
//void unlock_address_host_space(void* ptr);
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
/// \class OpenMPTargetSpace
/// \brief Memory space used by the OpenMPTarget execution space.
///
/// NOTE(review): the earlier description called this "host memory ... the
/// usual CPU-accessible memory". The DeepCopy specializations in this header
/// address this space through omp_get_default_device() and address HostSpace
/// through omp_get_initial_device(), so the "host" wording should be confirmed.
class OpenMPTargetSpace {
public:

  //! Tag this class as a Kokkos memory space.
  using memory_space = OpenMPTargetSpace ;
  using size_type    = size_t ;

  /// \typedef execution_space
  /// \brief Default execution space for this memory space.
  ///
  /// Every memory space has a default execution space.  It is used for
  /// things like initializing a View, which happens in parallel using the
  /// View's default execution space.
  using execution_space = Kokkos::Experimental::OpenMPTarget ;

  //! Preferred device_type: default execution space paired with this memory space.
  using device_type = Kokkos::Device< execution_space , memory_space > ;

  /*--------------------------------*/

  /**\brief  Default memory space instance */
  OpenMPTargetSpace();

  // Copy/move construction, copy/move assignment and destruction are all
  // compiler-generated.
  OpenMPTargetSpace( OpenMPTargetSpace && rhs ) = default ;
  OpenMPTargetSpace( const OpenMPTargetSpace & rhs ) = default ;
  OpenMPTargetSpace & operator = ( OpenMPTargetSpace && ) = default ;
  OpenMPTargetSpace & operator = ( const OpenMPTargetSpace & ) = default ;
  ~OpenMPTargetSpace() = default ;

  /**\brief  Allocate untracked memory in the space */
  void * allocate( const size_t arg_alloc_size ) const ;

  /**\brief  Deallocate untracked memory in the space */
  void deallocate( void * const arg_alloc_ptr
                 , const size_t arg_alloc_size ) const ;

private:

  // The tracked-allocation record for this space is granted private access.
  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > ;
};
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/// Allocation record specialization for OpenMPTargetSpace.
///
/// Derives from the untyped SharedAllocationRecord< void , void >, cannot be
/// copied, and keeps a copy of the OpenMPTargetSpace it was created with.
template<>
class SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >
  : public SharedAllocationRecord< void , void >
{
private:

  friend Kokkos::Experimental::OpenMPTargetSpace ;

  using RecordBase = SharedAllocationRecord< void , void > ;

  // Records are not copyable.
  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;

  // Default deallocation function handed to the constructor below.
  static void deallocate( RecordBase * );

  /**\brief  Root record for tracked allocations from this OpenMPTargetSpace instance */
  static RecordBase s_root_record ;

  // Space instance supplied at construction.
  const Kokkos::Experimental::OpenMPTargetSpace m_space ;

protected:

  ~SharedAllocationRecord();
  SharedAllocationRecord() = default ;

  // Construct a record for an allocation of arg_alloc_size bytes labelled
  // arg_label; arg_dealloc defaults to this class's deallocate().
  SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                        , const std::string                             & arg_label
                        , const size_t                                    arg_alloc_size
                        , const RecordBase::function_type                 arg_dealloc = & deallocate
                        );

public:

  std::string get_label() const;

  // Factory returning a pointer to a record for the given space, label and size.
  KOKKOS_INLINE_FUNCTION static
  SharedAllocationRecord * allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                                   , const std::string                             & arg_label
                                   , const size_t                                    arg_alloc_size
                                   );

  /**\brief  Allocate tracked memory in the space */
  static
  void * allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                         , const std::string & arg_label
                         , const size_t arg_alloc_size );

  /**\brief  Reallocate tracked memory in the space */
  static
  void * reallocate_tracked( void * const arg_alloc_ptr
                           , const size_t arg_alloc_size );

  /**\brief  Deallocate tracked memory in the space */
  static
  void deallocate_tracked( void * const arg_alloc_ptr );

  // Look up the record that belongs to an allocation pointer.
  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );

  // Write the records of the given space to the stream.
  static void print_records( std::ostream & , const Kokkos::Experimental::OpenMPTargetSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//TODO: implement all possible deep_copies
template<class ExecutionSpace>
struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
exec.fence();
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
}
};
template<class ExecutionSpace>
struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,HostSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
exec.fence();
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
}
};
template<class ExecutionSpace>
struct DeepCopy<HostSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
exec.fence();
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
}
};
/// Access check from Kokkos::HostSpace to OpenMPTargetSpace: 'value' is false.
/// Both verify() overloads are nevertheless empty and perform no check.
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::OpenMPTargetSpace >
{
  enum { value = false };

  //! Empty body.
  inline static void verify() {}

  //! Pointer overload; the argument is ignored.
  inline static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
#endif
#endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */

File diff suppressed because it is too large Load Diff

View File

@ -44,9 +44,10 @@
#ifndef KOKKOS_QTHREADS_HPP
#define KOKKOS_QTHREADS_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#ifdef KOKKOS_ENABLE_QTHREADS
#include <Kokkos_Core_fwd.hpp>
// Defines to enable experimental Qthreads functionality.
#define QTHREAD_LOCAL_PRIORITY
@ -150,6 +151,8 @@ public:
int shepherd_size() const;
int shepherd_worker_size() const;
static const char* name();
};
} // namespace Kokkos
@ -194,5 +197,5 @@ struct VerifyExecutionCanAccessMemorySpace
//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
#endif // #define KOKKOS_ENABLE_QTHREADS
#endif // #define KOKKOS_QTHREADS_HPP

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,9 +44,9 @@
#ifndef KOKKOS_SCRATCHSPACE_HPP
#define KOKKOS_SCRATCHSPACE_HPP
#include <stdio.h>
#include <cstdio>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <Kokkos_Concepts.hpp>
/*--------------------------------------------------------------------------*/
@ -57,7 +57,7 @@ namespace Kokkos {
*/
template< class ExecSpace >
class ScratchMemorySpace {
static_assert (Impl::is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
static_assert (is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
public:
// Alignment of memory chunks returned by 'get'
@ -161,6 +161,3 @@ public:
#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -47,6 +47,9 @@
#ifndef KOKKOS_SERIAL_HPP
#define KOKKOS_SERIAL_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_SERIAL )
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Parallel.hpp>
@ -63,7 +66,6 @@
#include <KokkosExp_MDRangePolicy.hpp>
#if defined( KOKKOS_ENABLE_SERIAL )
namespace Kokkos {
@ -163,6 +165,7 @@ public:
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
static const char* name();
//--------------------------------------------------------------------------
};
@ -515,7 +518,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
@ -798,7 +801,7 @@ public:
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_result_ptr( reducer.view().data() )
, m_shared( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
@ -821,5 +824,3 @@ public:
#endif // defined( KOKKOS_ENABLE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -46,25 +46,10 @@
//----------------------------------------------------------------------------
#include <Kokkos_Core_fwd.hpp>
// If compiling with CUDA then must be using CUDA 8 or better
// and use relocateable device code to enable the task policy.
// nvcc relocatable device code option: --relocatable-device-code=true
#if ( defined( KOKKOS_ENABLE_CUDA ) )
#if ( 8000 <= CUDA_VERSION ) && \
defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_TASKDAG
#endif
#else
#define KOKKOS_ENABLE_TASKDAG
#endif
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
#include <Kokkos_Core_fwd.hpp>
//----------------------------------------------------------------------------
#include <Kokkos_MemoryPool.hpp>
@ -371,7 +356,7 @@ struct TaskPolicyData
TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicyData( DepFutureType && arg_future
TaskPolicyData( DepFutureType const & arg_future
, Kokkos::TaskPriority const & arg_priority )
: m_scheduler( 0 )
, m_dependence( arg_future )
@ -385,6 +370,15 @@ struct TaskPolicyData
, m_dependence()
, m_priority( static_cast<int>( arg_priority ) )
{}
// Construct policy data from an explicit scheduler together with a dependence
// future and a priority.
//   arg_scheduler : its address is stored in m_scheduler (no copy is made);
//                   presumably it must stay alive while this policy data is
//                   in use -- confirm against callers.
//   arg_future    : copied into m_dependence.
//   arg_priority  : stored in m_priority as its integer value.
KOKKOS_INLINE_FUNCTION
TaskPolicyData( scheduler_type const & arg_scheduler
, DepFutureType const & arg_future
, Kokkos::TaskPriority const & arg_priority )
: m_scheduler( & arg_scheduler )
, m_dependence( arg_future )
, m_priority( static_cast<int>( arg_priority ) )
{}
};
} // namespace Impl
@ -413,6 +407,7 @@ public:
using execution_space = ExecSpace ;
using memory_space = typename queue_type::memory_space ;
using memory_pool = typename queue_type::memory_pool ;
using member_type =
typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
@ -431,9 +426,7 @@ public:
KOKKOS_INLINE_FUNCTION
TaskScheduler & operator = ( TaskScheduler const & rhs ) = default ;
TaskScheduler( memory_space const & arg_memory_space
, unsigned const arg_memory_pool_capacity
, unsigned const arg_memory_pool_log2_superblock = 12 )
TaskScheduler( memory_pool const & arg_memory_pool )
: m_track()
, m_queue(0)
{
@ -442,21 +435,37 @@ public:
record_type ;
record_type * record =
record_type::allocate( arg_memory_space
record_type::allocate( memory_space()
, "TaskQueue"
, sizeof(queue_type)
);
m_queue = new( record->data() )
queue_type( arg_memory_space
, arg_memory_pool_capacity
, arg_memory_pool_log2_superblock );
m_queue = new( record->data() ) queue_type( arg_memory_pool );
record->m_destroy.m_queue = m_queue ;
m_track.assign_allocated_record_to_uninitialized( record );
}
// Convenience constructor: builds a memory_pool from the memory space and the
// four sizing arguments, then delegates to the constructor taking a
// memory_pool.
// The trailing "// = ..." notes are comments only, not active default
// arguments; every argument has to be passed explicitly.
TaskScheduler( memory_space const & arg_memory_space
, size_t const mempool_capacity
, unsigned const mempool_min_block_size // = 1u << 6
, unsigned const mempool_max_block_size // = 1u << 10
, unsigned const mempool_superblock_size // = 1u << 12
)
: TaskScheduler( memory_pool( arg_memory_space
, mempool_capacity
, mempool_min_block_size
, mempool_max_block_size
, mempool_superblock_size ) )
{}
//----------------------------------------
/**\brief  Pointer to the task queue's memory pool, or null when this
 *         scheduler has no queue.
 *
 *  The queue's m_memory member is accessed as an object elsewhere in this
 *  class (for example m_queue->m_memory.capacity()), so its address is taken
 *  here; returning the member itself does not convert to memory_pool *.
 */
KOKKOS_INLINE_FUNCTION
memory_pool * memory() const noexcept
  { return m_queue ? &( m_queue->m_memory ) : nullptr ; }
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
@ -502,7 +511,12 @@ public:
: (queue_type*) 0 );
if ( 0 == queue ) {
Kokkos::abort("Kokkos spawn given null Future" );
Kokkos::abort("Kokkos spawn requires scheduler or non-null Future");
}
if ( arg_policy.m_dependence.m_task != 0 &&
arg_policy.m_dependence.m_task->m_queue != queue ) {
Kokkos::abort("Kokkos spawn given incompatible scheduler and Future");
}
//----------------------------------------
@ -641,7 +655,7 @@ public:
KOKKOS_INLINE_FUNCTION
int allocation_capacity() const noexcept
{ return m_queue->m_memory.get_mem_size(); }
{ return m_queue->m_memory.capacity(); }
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const noexcept
@ -696,6 +710,22 @@ TaskTeam( T const & arg
>( arg , arg_priority );
}
/**\brief  Construct a TaskTeam execution policy from an explicit scheduler
 *         and a dependence future.
 *
 *  The enable_if on the priority parameter restricts this overload to
 *  arguments F for which Kokkos::is_future<F>::value is true.
 *  The priority defaults to TaskPriority::Regular.
 */
template< typename E , typename F >
Kokkos::Impl::
TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
KOKKOS_INLINE_FUNCTION
TaskTeam( TaskScheduler<E> const & arg_scheduler
        , F                const & arg_future
        , typename std::enable_if< Kokkos::is_future<F>::value ,
            TaskPriority >::type const & arg_priority = TaskPriority::Regular
        )
{
  // Same type as the declared return type, named once for readability.
  using policy_data_type =
    Kokkos::Impl::TaskPolicyData
      < Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F > ;

  return policy_data_type( arg_scheduler , arg_future , arg_priority );
}
// Construct a TaskSingle execution policy
template< typename T >
@ -721,6 +751,22 @@ TaskSingle( T const & arg
>( arg , arg_priority );
}
/**\brief  Construct a TaskSingle execution policy from an explicit scheduler
 *         and a dependence future.
 *
 *  The enable_if on the priority parameter restricts this overload to
 *  arguments F for which Kokkos::is_future<F>::value is true.
 *  The priority defaults to TaskPriority::Regular.
 */
template< typename E , typename F >
Kokkos::Impl::
TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
KOKKOS_INLINE_FUNCTION
TaskSingle( TaskScheduler<E> const & arg_scheduler
          , F                const & arg_future
          , typename std::enable_if< Kokkos::is_future<F>::value ,
              TaskPriority >::type const & arg_priority = TaskPriority::Regular
          )
{
  // Same type as the declared return type, named once for readability.
  using policy_data_type =
    Kokkos::Impl::TaskPolicyData
      < Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F > ;

  return policy_data_type( arg_scheduler , arg_future , arg_priority );
}
//----------------------------------------------------------------------------
/**\brief A host control thread spawns a task with options
@ -849,3 +895,4 @@ void wait( TaskScheduler< ExecSpace > const & scheduler )
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_TASKSCHEDULER_HPP */

View File

@ -44,9 +44,10 @@
#ifndef KOKKOS_THREADS_HPP
#define KOKKOS_THREADS_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_THREADS )
#if defined( KOKKOS_ENABLE_PTHREAD )
#include <Kokkos_Core_fwd.hpp>
#include <cstddef>
#include <iosfwd>
@ -178,6 +179,7 @@ public:
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
static const char* name();
//@}
//----------------------------------------
};
@ -190,7 +192,7 @@ namespace Kokkos {
namespace Impl {
template<>
struct MemorySpaceAccess
struct MemorySpaceAccess
< Kokkos::Threads::memory_space
, Kokkos::Threads::scratch_memory_space
>
@ -227,6 +229,6 @@ struct VerifyExecutionCanAccessMemorySpace
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
#endif /* #define KOKKOS_THREADS_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,7 +44,7 @@
#ifndef KOKKOS_TIMER_HPP
#define KOKKOS_TIMER_HPP
#include <stddef.h>
#include <cstddef>
#ifdef _MSC_VER
#undef KOKKOS_ENABLE_LIBRT
@ -110,3 +110,4 @@ public:
} // namespace Kokkos
#endif /* #ifndef KOKKOS_TIMER_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -51,3 +51,4 @@
#endif
#endif

View File

@ -625,7 +625,7 @@ private:
#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
Kokkos::Impl::view_verify_operator_bounds ARG ;
Kokkos::Impl::view_verify_operator_bounds< typename traits::memory_space > ARG ;
#else
@ -646,12 +646,7 @@ public:
), reference_type >::type
operator()( Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,args...) )
return m_map.reference();
}
@ -669,12 +664,7 @@ public:
operator()( const I0 & i0
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
return m_map.reference(i0);
}
@ -690,13 +680,7 @@ public:
operator()( const I0 & i0
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
return m_map.m_handle[ i0 ];
}
@ -712,12 +696,7 @@ public:
operator()( const I0 & i0
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
}
@ -733,12 +712,7 @@ public:
), reference_type >::type
operator[]( const I0 & i0 ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
return m_map.reference(i0);
}
@ -752,12 +726,7 @@ public:
), reference_type >::type
operator[]( const I0 & i0 ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
return m_map.m_handle[ i0 ];
}
@ -771,12 +740,7 @@ public:
), reference_type >::type
operator[]( const I0 & i0 ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
}
@ -794,12 +758,7 @@ public:
operator()( const I0 & i0 , const I1 & i1
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
return m_map.reference(i0,i1);
}
@ -815,12 +774,7 @@ public:
operator()( const I0 & i0 , const I1 & i1
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
}
@ -836,12 +790,7 @@ public:
operator()( const I0 & i0 , const I1 & i1
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
}
@ -857,12 +806,7 @@ public:
operator()( const I0 & i0 , const I1 & i1
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
}
@ -878,12 +822,7 @@ public:
operator()( const I0 & i0 , const I1 & i1
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
}
@ -899,12 +838,7 @@ public:
operator()( const I0 & i0 , const I1 & i1
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
i1 * m_map.m_offset.m_stride.S1 ];
}
@ -923,12 +857,7 @@ public:
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
}
@ -943,12 +872,7 @@ public:
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
return m_map.reference(i0,i1,i2);
}
@ -966,12 +890,7 @@ public:
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
}
@ -986,12 +905,7 @@ public:
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
return m_map.reference(i0,i1,i2,i3);
}
@ -1011,12 +925,7 @@ public:
, const I4 & i4
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
}
@ -1033,12 +942,7 @@ public:
, const I4 & i4
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
return m_map.reference(i0,i1,i2,i3,i4);
}
@ -1058,12 +962,7 @@ public:
, const I4 & i4 , const I5 & i5
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
}
@ -1080,12 +979,7 @@ public:
, const I4 & i4 , const I5 & i5
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
return m_map.reference(i0,i1,i2,i3,i4,i5);
}
@ -1105,12 +999,7 @@ public:
, const I4 & i4 , const I5 & i5 , const I6 & i6
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
}
@ -1127,12 +1016,7 @@ public:
, const I4 & i4 , const I5 & i5 , const I6 & i6
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
}
@ -1152,12 +1036,7 @@ public:
, const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
}
@ -1174,12 +1053,7 @@ public:
, const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
, Args ... args ) const
{
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
#else
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
#endif
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
}

View File

@ -9,14 +9,14 @@ default: messages build-lib
echo "End Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
else
CXX = g++
endif
CXXFLAGS = -O3
LINK ?= $(CXX)
LDFLAGS ?=
LDFLAGS ?=
include $(KOKKOS_PATH)/Makefile.kokkos
@ -66,7 +66,7 @@ else
KOKKOS_DEBUG_CMAKE = ON
endif
messages:
messages:
echo "Start Build"
build-makefile-kokkos:
@ -80,13 +80,13 @@ build-makefile-kokkos:
echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos
echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
echo "CXX ?= $(CXX)" >> Makefile.kokkos
echo "CXX ?= $(CXX)" >> Makefile.kokkos
echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Variables used in application Makefiles" >> Makefile.kokkos
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
@ -133,11 +133,11 @@ build-cmake-kokkos:
echo "else()" >> kokkos.cmake
echo ' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
echo "endif()" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Variables used in application Makefiles" >> kokkos.cmake
echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
@ -164,7 +164,7 @@ build-cmake-kokkos:
build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
mkdir:
mkdir:
mkdir -p $(PREFIX)
mkdir -p $(PREFIX)/bin
mkdir -p $(PREFIX)/include
@ -187,7 +187,7 @@ copy-openmp: mkdir
mkdir -p $(PREFIX)/include/OpenMP
cp $(COPY_FLAG) $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
install: mkdir $(CONDITIONAL_COPIES) build-lib
install: mkdir $(CONDITIONAL_COPIES) build-lib
cp $(COPY_FLAG) $(NVCC_WRAPPER) $(PREFIX)/bin
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl

View File

@ -41,7 +41,10 @@
//@HEADER
*/
#include <stdio.h>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP )
#include <cstdio>
#include <limits>
#include <iostream>
#include <vector>
@ -51,7 +54,6 @@
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#ifdef KOKKOS_ENABLE_OPENMP
namespace Kokkos {
namespace Impl {
@ -82,13 +84,13 @@ bool s_using_hwloc = false;
namespace Kokkos {
namespace Impl {
int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
int OpenMPExec::m_pool_topo[ 4 ] = { 0 };
HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
void OpenMPexec::verify_is_process( const char * const label )
void OpenMPExec::verify_is_process( const char * const label )
{
if ( omp_in_parallel() ) {
std::string msg( label );
@ -97,7 +99,7 @@ void OpenMPexec::verify_is_process( const char * const label )
}
}
void OpenMPexec::verify_initialized( const char * const label )
void OpenMPExec::verify_initialized( const char * const label )
{
if ( 0 == m_pool[0] ) {
std::string msg( label );
@ -122,7 +124,7 @@ void OpenMPexec::verify_initialized( const char * const label )
namespace Kokkos {
namespace Impl {
void OpenMPexec::clear_thread_data()
void OpenMPExec::clear_thread_data()
{
const size_t member_bytes =
sizeof(int64_t) *
@ -149,7 +151,7 @@ void OpenMPexec::clear_thread_data()
/* END #pragma omp parallel */
}
void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes )
@ -231,7 +233,7 @@ namespace Kokkos {
//----------------------------------------------------------------------------
int OpenMP::is_initialized()
{ return 0 != Impl::OpenMPexec::m_pool[0]; }
{ return 0 != Impl::OpenMPExec::m_pool[0]; }
void OpenMP::initialize( unsigned thread_count ,
unsigned use_numa_count ,
@ -257,7 +259,7 @@ void OpenMP::initialize( unsigned thread_count ,
static int omp_max_threads = nthreads;
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
bool thread_spawn_failed = false ;
@ -270,7 +272,7 @@ void OpenMP::initialize( unsigned thread_count ,
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ];
// If hwloc is available then use its maximum value.
@ -316,16 +318,16 @@ void OpenMP::initialize( unsigned thread_count ,
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
: omp_rank ;
Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ;
}
/* END #pragma omp critical */
}
/* END #pragma omp parallel */
if ( ! thread_spawn_failed ) {
Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
Impl::OpenMPExec::m_pool_topo[0] = thread_count ;
Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
// New, unified host thread team data:
{
@ -334,7 +336,7 @@ void OpenMP::initialize( unsigned thread_count ,
size_t team_shared_bytes = 1024 * thread_count ;
size_t thread_local_bytes = 1024 ;
Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
@ -371,15 +373,15 @@ void OpenMP::initialize( unsigned thread_count ,
void OpenMP::finalize()
{
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" );
// New, unified host thread team data:
Impl::OpenMPexec::clear_thread_data();
Impl::OpenMPExec::clear_thread_data();
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
Impl::OpenMPexec::m_pool_topo[2] = 0 ;
Impl::OpenMPExec::m_pool_topo[0] = 0 ;
Impl::OpenMPExec::m_pool_topo[1] = 0 ;
Impl::OpenMPExec::m_pool_topo[2] = 0 ;
omp_set_num_threads(1);
@ -396,7 +398,7 @@ void OpenMP::finalize()
void OpenMP::print_configuration( std::ostream & s , const bool detail )
{
Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" );
s << "Kokkos::OpenMP" ;
@ -414,12 +416,12 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
;
#endif
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
if ( is_initialized ) {
const int numa_count = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
const int core_per_numa = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ;
const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
s << " thread_pool_topology[ " << numa_count
<< " x " << core_per_numa
@ -428,7 +430,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
<< std::endl ;
if ( detail ) {
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] );
#pragma omp parallel
{
@ -442,7 +444,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
s << " thread omp_rank[" << i << "]"
<< " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
<< " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]"
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
<< std::endl ;
}
@ -457,6 +459,11 @@ int OpenMP::concurrency() {
return thread_pool_size(0);
}
const char* OpenMP::name() { return "OpenMP"; }
} // namespace Kokkos
#else
void KOKKOS_CORE_SRC_OPENMP_EXEC_PREVENT_LINK_ERROR() {}
#endif //KOKKOS_ENABLE_OPENMP

View File

@ -44,6 +44,9 @@
#ifndef KOKKOS_OPENMPEXEC_HPP
#define KOKKOS_OPENMPEXEC_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP )
#include <Kokkos_OpenMP.hpp>
#include <impl/Kokkos_Traits.hpp>
@ -66,7 +69,7 @@ namespace Impl {
//----------------------------------------------------------------------------
/** \brief Data for OpenMP thread execution */
class OpenMPexec {
class OpenMPExec {
public:
friend class Kokkos::OpenMP ;
@ -324,17 +327,21 @@ public:
namespace Kokkos {
inline
bool OpenMP::in_parallel()
{ return omp_in_parallel(); }
inline
int OpenMP::thread_pool_size( int depth )
{
return Impl::OpenMPexec::pool_size(depth);
return Impl::OpenMPExec::pool_size(depth);
}
KOKKOS_INLINE_FUNCTION
int OpenMP::thread_pool_rank()
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ];
#else
return -1 ;
#endif
@ -342,4 +349,6 @@ int OpenMP::thread_pool_rank()
} // namespace Kokkos
#endif
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,9 +44,12 @@
#ifndef KOKKOS_OPENMP_PARALLEL_HPP
#define KOKKOS_OPENMP_PARALLEL_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP )
#include <omp.h>
#include <iostream>
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
@ -58,7 +61,7 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::OpenMP
, Kokkos::OpenMP
>
{
private:
@ -77,7 +80,7 @@ private:
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend )
{
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
@ -94,7 +97,7 @@ private:
, const Member ibeg , const Member iend )
{
const TagType t{} ;
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
@ -111,12 +114,12 @@ public:
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
#pragma omp parallel
{
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
@ -200,11 +203,6 @@ private:
, const Member ibeg , const Member iend
, reference_type update )
{
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( iwork , update );
}
@ -218,11 +216,6 @@ private:
, reference_type update )
{
const TagType t{} ;
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( t , iwork , update );
}
@ -235,13 +228,13 @@ public:
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_bytes =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
OpenMPexec::resize_thread_data( pool_reduce_bytes
OpenMPExec::resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
@ -249,7 +242,7 @@ public:
#pragma omp parallel
{
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
@ -282,12 +275,12 @@ public:
// Reduction:
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -305,7 +298,7 @@ public:
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ViewType & arg_result_view
, const ViewType & arg_view
, typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
@ -313,7 +306,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
, m_result_ptr( arg_view.data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
@ -327,7 +320,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
@ -378,11 +371,6 @@ private:
, const Member ibeg , const Member iend
, reference_type update , const bool final )
{
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( iwork , update , final );
}
@ -396,11 +384,6 @@ private:
, reference_type update , const bool final )
{
const TagType t{} ;
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( t , iwork , update , final );
}
@ -411,13 +394,13 @@ public:
inline
void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan");
const int value_count = Analysis::value_count( m_functor );
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
OpenMPexec::resize_thread_data( pool_reduce_bytes
OpenMPExec::resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
@ -425,7 +408,7 @@ public:
#pragma omp parallel
{
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
@ -565,22 +548,22 @@ public:
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t pool_reduce_size = 0 ; // Never shrinks
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPexec::resize_thread_data( pool_reduce_size
OpenMPExec::resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
{
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
const int active = data.organize_team( m_policy.team_size() );
@ -723,8 +706,8 @@ public:
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
@ -733,14 +716,14 @@ public:
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPexec::resize_thread_data( pool_reduce_size
OpenMPExec::resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
{
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
const int active = data.organize_team( m_policy.team_size() );
@ -785,12 +768,12 @@ public:
// Reduction:
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
@ -830,7 +813,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_result_ptr( reducer.view().data() )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >
@ -849,5 +832,6 @@ public:
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
#endif /* KOKKOS_OPENMP_PARALLEL_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,15 +36,16 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_TaskQueue_impl.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
@ -110,21 +111,27 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
static task_root_type * const end =
(task_root_type *) task_root_type::EndTag ;
HostThreadTeamData & team_data_single =
HostThreadTeamDataSingleton::singleton();
const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
// const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core
// const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
fflush(stdout);
#endif
OpenMPExec::resize_thread_data( 0 /* global reduce buffer */
, 512 * team_size /* team reduce buffer */
, 0 /* team shared buffer */
, 0 /* thread local buffer */
);
#pragma omp parallel
{
Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data();
// Organizing threads into a team performs a barrier across the
// entire pool to ensure proper initialization of the team
@ -164,7 +171,7 @@ fflush(stdout);
if ( 0 != task && end != task ) {
// team member #0 completes the previously executed task,
// completion may delete the task
queue->complete( task );
queue->complete( task );
}
// If 0 == m_ready_count then set task = 0
@ -301,7 +308,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
(*task->m_apply)( task , & single_exec );
queue->complete( task );
queue->complete( task );
} while(1);
}
@ -310,7 +317,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#else
void KOKKOS_CORE_SRC_OPENMP_KOKKOS_OPENMP_TASK_PREVENT_LINK_ERROR() {}
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,6 +44,7 @@
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------

View File

@ -0,0 +1,306 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <algorithm>
#include <omp.h>
#include <Kokkos_Macros.hpp>
/*--------------------------------------------------------------------------*/
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include <iostream>
#include <sstream>
#include <cstring>
#include <Kokkos_OpenMPTargetSpace.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Kokkos_Atomic.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
/* Default allocation mechanism */
// Default constructor; the body is intentionally empty.
OpenMPTargetSpace::OpenMPTargetSpace()
{}
/** \brief Request arg_alloc_size bytes from the default OpenMP target device.
 *
 *  The pointer produced by omp_target_alloc is handed back unchanged;
 *  no check for a failed allocation is made here.
 */
void * OpenMPTargetSpace::allocate( const size_t arg_alloc_size ) const
{
  static_assert( sizeof(void*) == sizeof(uintptr_t)
               , "Error sizeof(void*) != sizeof(uintptr_t)" );

  const int device_id = omp_get_default_device();

  return omp_target_alloc( arg_alloc_size , device_id );
}
/** \brief Release memory previously obtained from allocate().
 *
 *  A null pointer is ignored. arg_alloc_size is accepted for interface
 *  compatibility but is not used by this implementation.
 */
void OpenMPTargetSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
{
  // Nothing to free for a null pointer.
  if ( ! arg_alloc_ptr ) return ;

  const int device_id = omp_get_default_device();

  omp_target_free( arg_alloc_ptr , device_id );
}
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Definition of the static root record for this memory space. Its address is
// passed to the base-class constructor of every record created below and to
// print_host_accessible_records() in print_records().
SharedAllocationRecord< void , void >
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record ;
/** \brief Return the record's allocation (as stored in the base record's
 *         m_alloc_ptr / m_alloc_size) to the memory space it came from.
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
~SharedAllocationRecord()
{
  typedef SharedAllocationRecord< void , void > base_record ;

  void * const block = base_record::m_alloc_ptr ;
  const auto   bytes = base_record::m_alloc_size ;

  m_space.deallocate( block , bytes );
}
//TODO: Implement deep copy back see CudaSpace
/** \brief Label of this allocation.
 *
 *  Currently always the fixed placeholder below; the label handed to the
 *  constructor is not read back here (see TODO above).
 */
std::string
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_label() const
{
  const std::string placeholder( "OpenMPTargetAllocation" );
  return placeholder ;
}
/** \brief Heap-allocate a new record (and through its constructor the
 *         underlying memory) for arg_alloc_size bytes labelled arg_label.
 *
 *  The caller receives the raw pointer; the matching release is deallocate().
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >*
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space,
          const std::string & arg_label ,
          const size_t arg_alloc_size)
{
  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > record_type ;

  record_type * const fresh = new record_type( arg_space , arg_label , arg_alloc_size );

  return fresh ;
}
/** \brief Destroy a record created by allocate().
 *
 *  arg_rec is a base-class pointer; it is cast back to this specialization
 *  before being deleted.
 */
void
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
deallocate( SharedAllocationRecord<void, void> * arg_rec )
{
  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > record_type ;

  record_type * const self = static_cast< record_type * >( arg_rec );

  delete self ;
}
/** \brief Construct a record and allocate [ SharedAllocationHeader , user memory ]
 *         from arg_space.
 *
 *  \param arg_space       memory space used for the allocation (copied into m_space)
 *  \param arg_label       label recorded in the header
 *  \param arg_alloc_size  number of user bytes requested (header size is added)
 *  \param arg_dealloc     deallocation function forwarded to the base record
 *
 *  A header is filled in locally, but it is not yet copied into the
 *  allocated memory (see the TODO at the end of the body).
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                      , const std::string & arg_label
                      , const size_t arg_alloc_size
                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
                      )
  // Pass through allocated [ SharedAllocationHeader , user_memory ]
  // Pass through deallocation function
  : SharedAllocationRecord< void , void >
      ( & SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record
      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
      , sizeof(SharedAllocationHeader) + arg_alloc_size
      , arg_dealloc
      )
  , m_space( arg_space )
{
  SharedAllocationHeader header;

  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );

  strncpy( header.m_label
         , arg_label.c_str()
         , SharedAllocationHeader::maximum_label_length
         );

  // strncpy() writes no terminator when the label is at least
  // maximum_label_length characters long; force one so the label is
  // always a valid C string.
  header.m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;

  //TODO DeepCopy
  // DeepCopy
}
//----------------------------------------------------------------------------
/** \brief Allocate tracked memory and return the pointer to the user data.
 *
 *  A request for zero bytes yields a null pointer and creates no record.
 *  Otherwise a record is created and RecordBase::increment() is called on
 *  it before its data pointer is returned.
 */
void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                , const std::string & arg_alloc_label
                , const size_t arg_alloc_size )
{
  if ( 0 == arg_alloc_size ) {
    return static_cast< void * >( 0 );
  }

  SharedAllocationRecord * const rec =
    allocate( arg_space , arg_alloc_label , arg_alloc_size );

  RecordBase::increment( rec );

  return rec->data();
}
/** \brief Drop one reference on the record owning arg_alloc_ptr.
 *
 *  A null pointer is ignored. Otherwise the record is looked up with
 *  get_record() and passed to RecordBase::decrement().
 */
void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
  if ( 0 == arg_alloc_ptr ) return ;

  SharedAllocationRecord * const rec = get_record( arg_alloc_ptr );

  RecordBase::decrement( rec );
}
/** \brief Replace a tracked allocation by a new one of arg_alloc_size bytes.
 *
 *  The new record reuses the old record's memory space and label. The old
 *  record's reference is dropped after the new one has been incremented.
 *  The contents are NOT copied: the deep copy below is still disabled.
 */
void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
                  , const size_t arg_alloc_size )
{
  SharedAllocationRecord * const previous = get_record( arg_alloc_ptr );

  const std::string label = previous->get_label();

  SharedAllocationRecord * const replacement =
    allocate( previous->m_space , label , arg_alloc_size );

  //Kokkos::Impl::DeepCopy<OpenMPTargetSpace,OpenMPTargetSpace>( r_new->data() , r_old->data()
  //                                           , std::min( r_old->size() , r_new->size() ) );

  RecordBase::increment( replacement );
  RecordBase::decrement( previous );

  return replacement->data();
}
/** \brief Find the record that owns the user pointer alloc_ptr.
 *
 *  The header is located via SharedAllocationHeader::get_header() and its
 *  m_record back-pointer is taken as the record. The lookup is rejected
 *  with a runtime exception when alloc_ptr is null, when the header holds
 *  no record, or when the record's m_alloc_ptr does not point back at the
 *  header.
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record( void * alloc_ptr )
{
  typedef SharedAllocationHeader  Header ;
  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >  RecordHost ;

  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;

  // '! record' guards the dereference that follows: a header whose
  // m_record is null must produce the error, not a null dereference.
  if ( ! alloc_ptr || ! record || record->m_alloc_ptr != head ) {
    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record ERROR" ) );
  }

  return record ;
}
// Iterate records to print orphaned memory ...
/** \brief Print the records reachable from s_root_record to s.
 *
 *  Delegates to the base record's print_host_accessible_records().
 *  The space argument is not used.
 */
void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
print_records( std::ostream & s , const Kokkos::Experimental::OpenMPTargetSpace & space , bool detail )
{
  typedef SharedAllocationRecord< void , void > base_record ;

  base_record::print_host_accessible_records( s , "OpenMPTargetSpace" , & s_root_record , detail );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
// Primary template: declared only, never defined in this file.
template< class >
struct ViewOperatorBoundsErrorAbort ;
// Specialization for OpenMPTargetSpace. apply() receives the view rank,
// eight extents (n0..n7) and eight indices (i0..i7); it is defined right
// after this declaration.
template<>
struct ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace > {
static void apply( const size_t rank
, const size_t n0 , const size_t n1
, const size_t n2 , const size_t n3
, const size_t n4 , const size_t n5
, const size_t n6 , const size_t n7
, const size_t i0 , const size_t i1
, const size_t i2 , const size_t i3
, const size_t i4 , const size_t i5
, const size_t i6 , const size_t i7 );
};
/** \brief Report a view bounds error by printing rank, extents and indices.
 *
 *  \param rank    rank of the view
 *  \param n0..n7  extents of the view
 *  \param i0..i7  indices that were used
 *
 *  Only prints to stdout; the exception throw is still disabled below, so
 *  control returns to the caller. The arguments are size_t and are
 *  therefore formatted with %zu.
 */
void ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace >::
apply( const size_t rank
     , const size_t n0 , const size_t n1
     , const size_t n2 , const size_t n3
     , const size_t n4 , const size_t n5
     , const size_t n6 , const size_t n7
     , const size_t i0 , const size_t i1
     , const size_t i2 , const size_t i3
     , const size_t i4 , const size_t i5
     , const size_t i6 , const size_t i7 )
{
  printf( "View operator bounds error : rank(%zu) dim(%zu,%zu,%zu,%zu,%zu,%zu,%zu,%zu) index(%zu,%zu,%zu,%zu,%zu,%zu,%zu,%zu)"
        , rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7
        , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
  //Kokkos::Impl::throw_runtime_exception( buffer );
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*
namespace Kokkos {
namespace {
const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
}
namespace Impl {
void init_lock_array_host_space() {
static int is_initialized = 0;
if(! is_initialized)
for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
HOST_SPACE_ATOMIC_LOCKS[i] = 0;
}
bool lock_address_host_space(void* ptr) {
return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
0 , 1);
}
void unlock_address_host_space(void* ptr) {
atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
0);
}
}
}*/

View File

@ -0,0 +1,273 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <stdio.h>
#include <limits>
#include <iostream>
#include <vector>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
#include <iostream>
#include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#ifdef KOKKOS_ENABLE_OPENMPTARGET
namespace Kokkos {
namespace Impl {
namespace {
// Forward declaration: the function must be nameable by the initializer of
// kokkos_omp_in_critical_region below, which precedes its definition.
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel();
// The comma expression calls HostSpace::register_in_parallel() with the
// callback while this variable is being initialized, then stores 0 in it.
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
// Callback registered above: nonzero when omp_in_parallel() reports a
// parallel region and kokkos_omp_in_critical_region is zero.
// Always 0 when __CUDA_ARCH__ is defined.
KOKKOS_INLINE_FUNCTION
int kokkos_omp_in_parallel()
{
#ifndef __CUDA_ARCH__
return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
#else
return 0;
#endif
}
// Read by OpenMPTarget::finalize() to decide whether to unbind the thread;
// nothing in this file assigns it a value other than this initial false.
bool s_using_hwloc = false;
} // namespace
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
// Definition of the static flag set by initialize(), cleared by finalize()
// and reported by is_initialized().
bool OpenMPTarget::m_is_initialized = false;
}
}
namespace Kokkos {
namespace Impl {
//int OpenMPTargetExec::m_map_rank[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
//int OpenMPTargetExec::m_pool_topo[ 4 ] = { 0 };
//OpenMPTargetExec * OpenMPTargetExec::m_pool[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
/** \brief Throw a runtime exception if called from inside an OpenMP
 *         parallel region. label prefixes the error message.
 */
void OpenMPTargetExec::verify_is_process( const char * const label )
{
  if ( ! omp_in_parallel() ) return ;

  const std::string message = std::string( label ) + " ERROR: in parallel" ;

  Kokkos::Impl::throw_runtime_exception( message );
}
/** \brief Check that the backend is initialized and that the OpenMP
 *         maximum thread count still equals thread_pool_size(0).
 *
 *  Each failed check reports through throw_runtime_exception with label
 *  as the message prefix.
 */
void OpenMPTargetExec::verify_initialized( const char * const label )
{
  const bool initialized =
    ( 0 != Kokkos::Experimental::OpenMPTarget::is_initialized() );

  if ( ! initialized ) {
    const std::string message = std::string( label ) + " ERROR: not initialized" ;
    Kokkos::Impl::throw_runtime_exception( message );
  }

  const bool threads_unchanged =
    ( omp_get_max_threads() == Kokkos::Experimental::OpenMPTarget::thread_pool_size(0) );

  if ( ! threads_unchanged ) {
    const std::string message =
      std::string( label ) + " ERROR: Initialized but threads modified inappropriately" ;
    Kokkos::Impl::throw_runtime_exception( message );
  }
}
// Definitions of the static scratch bookkeeping: the current scratch
// allocation and its size in bytes. Both are managed by resize_scratch()
// and clear_scratch().
void* OpenMPTargetExec::m_scratch_ptr = NULL;
int64_t OpenMPTargetExec::m_scratch_size = 0;
/** \brief Release the scratch allocation and reset the bookkeeping.
 *
 *  Afterwards m_scratch_ptr is null and m_scratch_size is 0, so the next
 *  resize_scratch() call with a non-zero request allocates again.
 */
void OpenMPTargetExec::clear_scratch()
{
  Kokkos::Experimental::OpenMPTargetSpace space;
  space.deallocate(m_scratch_ptr,m_scratch_size);
  m_scratch_ptr = NULL;
  // m_scratch_size is an integer byte count, not a pointer: reset it with
  // 0 rather than the null-pointer macro.
  m_scratch_size = 0;
}
// Returns the current scratch allocation (NULL until resize_scratch() has allocated one).
void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; }
/** \brief Grow the scratch allocation if the requested layout needs more room.
 *
 *  The required size is the sum of the three per-team amounts, each scaled
 *  by MAX_ACTIVE_TEAMS, plus the per-thread amount scaled by
 *  MAX_ACTIVE_THREADS. The buffer is only ever grown; when it is, the old
 *  buffer is released first and its contents are not preserved.
 */
void OpenMPTargetExec::resize_scratch( int64_t reduce_bytes ,
                                       int64_t team_reduce_bytes,
                                       int64_t team_shared_bytes, int64_t thread_local_bytes)
{
  Kokkos::Experimental::OpenMPTargetSpace space;

  const uint64_t required =
      MAX_ACTIVE_TEAMS   * reduce_bytes        // Inter Team Reduction
    + MAX_ACTIVE_TEAMS   * team_reduce_bytes   // Intra Team Reduction
    + MAX_ACTIVE_TEAMS   * team_shared_bytes   // Team Local Scratch
    + MAX_ACTIVE_THREADS * thread_local_bytes; // Thread Private Scratch

  // The current buffer is already large enough.
  if ( required <= m_scratch_size ) return ;

  space.deallocate( m_scratch_ptr , m_scratch_size );
  m_scratch_size = required ;
  m_scratch_ptr  = space.allocate( required );
}
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
//----------------------------------------------------------------------------
// Returns the m_is_initialized flag (converted to int): nonzero between
// initialize() and finalize().
int OpenMPTarget::is_initialized()
{ return m_is_initialized; }// != Impl::OpenMPTargetExec::m_pool[0]; }
// Initializes the backend: sets up the host lock array, starts profiling
// when KOKKOS_ENABLE_PROFILING is defined, and marks the backend as
// initialized. None of the three parameters is used by this body.
void OpenMPTarget::initialize( unsigned thread_count ,
unsigned use_numa_count ,
unsigned use_cores_per_numa )
{
// Before any other call to OMP query the maximum number of threads
// and save the value for re-initialization unit testing.
// NOTE(review): no such query is performed in this body.
// Init the array for used for arbitrarily sized atomics
Kokkos::Impl::init_lock_array_host_space();
#ifdef KOKKOS_ENABLE_PROFILING
Kokkos::Profiling::initialize();
#endif
m_is_initialized = true;
}
//----------------------------------------------------------------------------
/** \brief Shut the backend down.
 *
 *  Verifies that the backend is initialized and that the call is made
 *  outside a parallel region, clears the initialized flag, sets the
 *  OpenMP thread count to 1, unbinds the calling thread when hwloc binding
 *  was in use, and finalizes profiling when it is enabled.
 */
void OpenMPTarget::finalize()
{
  const char * const caller = "OpenMPTarget::finalize" ;

  Kokkos::Impl::OpenMPTargetExec::verify_initialized( caller );
  Kokkos::Impl::OpenMPTargetExec::verify_is_process( caller );

  m_is_initialized = false;

  omp_set_num_threads(1);

  const bool must_unbind =
    Kokkos::Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads();

  if ( must_unbind ) {
    hwloc::unbind_this_thread();
  }

#if defined( KOKKOS_ENABLE_PROFILING )
  Kokkos::Profiling::finalize();
#endif
}
//----------------------------------------------------------------------------
// Intended to print the backend configuration to s. At present the body
// only verifies that the call is made outside a parallel region; the
// reporting code is commented out, so nothing is written to s and detail
// is unused.
void OpenMPTarget::print_configuration( std::ostream & s , const bool detail )
{
Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::print_configuration" );
// Disabled implementation: it refers to m_pool, m_pool_topo and m_map_rank,
// whose definitions are commented out earlier in this file.
/*
s << "Kokkos::Experimental::OpenMPTarget" ;
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
s << " KOKKOS_ENABLE_OPENMPTARGET" ;
#endif
#if defined( KOKKOS_HAVE_HWLOC )
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
;
#endif
const bool is_initialized = 0 != Impl::OpenMPTargetExec::m_pool[0] ;
if ( is_initialized ) {
const int numa_count = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] ;
const int core_per_numa = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
const int thread_per_core = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
s << " thread_pool_topology[ " << numa_count
<< " x " << core_per_numa
<< " x " << thread_per_core
<< " ]"
<< std::endl ;
if ( detail ) {
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] );
#pragma omp parallel
{
#pragma omp critical
{
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
}
// END #pragma omp critical
}
// END #pragma omp parallel
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
s << " thread omp_rank[" << i << "]"
<< " kokkos_rank[" << Impl::OpenMPTargetExec::m_map_rank[ i ] << "]"
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
<< std::endl ;
}
}
}
else {
s << " not initialized" << std::endl ;
}
*/
}
// Reports the concurrency of this execution space as thread_pool_size(0).
int OpenMPTarget::concurrency() {
return thread_pool_size(0);
}
// Name of this execution space as a string literal.
const char* OpenMPTarget::name() { return "OpenMPTarget"; }
} // namespace Experimental
} // namespace Kokkos
#endif //KOKKOS_ENABLE_OPENMPTARGET

View File

@ -0,0 +1,727 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
#define KOKKOS_OPENMPTARGETEXEC_HPP
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <iostream>
#include <sstream>
#include <fstream>
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
/** \brief Data for OpenMPTarget thread execution */
// Static-only helper holding the scratch buffer bookkeeping and the
// state-verification routines of the OpenMPTarget backend.
class OpenMPTargetExec {
public:
// Upper bounds used by resize_scratch() to size the scratch buffer.
// NOTE(review): the meaning of the individual factors 256*8*56*4 and of the
// divisor 32 is not documented here - confirm against the target hardware.
enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
enum { MAX_ACTIVE_TEAMS = MAX_ACTIVE_THREADS/32 };
private:
// NOTE(review): declared but not referenced anywhere in the visible code;
// the scratch pointer actually used is the public m_scratch_ptr below.
static void* scratch_ptr;
public:
// Throws (via throw_runtime_exception) when called inside a parallel region.
static void verify_is_process( const char * const );
// Throws when the backend is not initialized or the thread count changed.
static void verify_initialized( const char * const );
// Current scratch allocation, or NULL when none exists.
static void* get_scratch_ptr();
// Releases the scratch allocation.
static void clear_scratch();
// Grows the scratch allocation to fit the given per-team / per-thread byte counts.
static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes );
// Scratch allocation and its size in bytes.
static void* m_scratch_ptr;
static int64_t m_scratch_size;
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Team-member handle for the OpenMPTarget backend: records this thread's
// rank within its team, the team's rank within the league, the vector lane,
// and a pointer to global scratch used by team_reduce(). Team
// synchronization is expressed with '#pragma omp barrier'.
class OpenMPTargetExecTeamMember {
public:
// Byte stride between consecutive teams' slices of m_glb_scratch in
// team_reduce(); also the size limit checked against sizeof(value_type) there.
enum { TEAM_REDUCE_SIZE = 512 };
/** \brief Thread states for team synchronization */
enum { Active = 0 , Rendezvous = 1 };
typedef Kokkos::Experimental::OpenMPTarget execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
// Team scratch handle; constructed with (0,0) by the constructor below.
scratch_memory_space m_team_shared ;
// Scratch sizes passed to the constructor as shmem_size_L1 / shmem_size_L2.
int m_team_scratch_size[2] ;
// omp_get_thread_num() / m_vector_length (set in the constructor body).
int m_team_rank ;
int m_team_size ;
int m_league_rank ;
int m_league_size ;
int m_vector_length ;
// omp_get_thread_num() % m_vector_length (set in the constructor body).
int m_vector_lane ;
// Base of the global scratch buffer handed to the constructor.
void* m_glb_scratch ;
// Disabled fan-in / fan-out helpers; they refer to members (m_exec,
// m_team_rank_rev, m_team_base_rev) that this class does not declare.
/*
// Fan-in team threads, root of the fan-in which does not block returns true
inline
bool team_fan_in() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
}
if ( m_team_rank_rev ) {
m_exec.state_set( Rendezvous );
memory_fence();
m_exec.state_wait( Rendezvous );
}
return 0 == m_team_rank_rev ;
}
inline
void team_fan_out() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
memory_fence();
}
}
*/
public:
// Scratch accessors: team_shmem() and team_scratch() select mode (0,1,0);
// thread_scratch() selects (0,team_size(),team_rank()). The int 'level'
// argument of team_scratch()/thread_scratch() is ignored.
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_shmem() const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
// Team-wide barrier, implemented as an OpenMP barrier.
KOKKOS_INLINE_FUNCTION void team_barrier() const
{
#pragma omp barrier
}
// NOTE(review): the implementation is commented out, so this call currently
// does nothing and leaves 'value' unchanged on every thread.
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType& value, const int& thread_id) const
{
/*#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ }
#else
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
, ValueType , void >::type type ;
type * const local_value = ((type*) m_exec.scratch_thread());
if(team_rank() == thread_id)
*local_value = value;
memory_fence();
team_barrier();
value = *local_value;
#endif*/
}
// Team reduction of 'value' through this team's TEAM_REDUCE_SIZE-byte slice
// of m_glb_scratch; every thread returns slot 0 of that slice.
// NOTE(review): 'op' is constructed but never used - values are combined
// with operator+= and slots are initialised with value_type(), so op_in
// does not influence the result. Confirm whether that is intended.
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
, const JoinOp & op_in ) const {
#pragma omp barrier
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
// Number of value_type slots that fit into one team's slice.
const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
// Step 1: the team cooperatively resets all n_values slots.
for(int i = m_team_rank; i < n_values; i+= m_team_size) {
team_scratch[i] = value_type();
}
#pragma omp barrier
// Step 2: threads add their value into slot (rank % n_values), one window
// of n_values consecutive ranks per pass, with a barrier between passes.
for(int k=0; k<m_team_size; k+=n_values) {
if((k <= m_team_rank) && (k+n_values > m_team_rank))
team_scratch[m_team_rank%n_values]+=value;
#pragma omp barrier
}
// Step 3: pairwise combination of the slots with doubling stride d; the
// total accumulates in slot 0.
for(int d = 1; d<n_values;d*=2) {
if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
}
#pragma omp barrier
}
return team_scratch[0];
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
// NOTE(review): the implementation is commented out; the function currently
// returns ArgType() and ignores both arguments.
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
{
/* // Make sure there is enough scratch space:
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
volatile type * const work_value = ((type*) m_exec.scratch_thread());
*work_value = value ;
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
// m_team_base[0] == highest ranking team member
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
//
// 1) copy from lower to higher rank, initialize lowest rank to zero
// 2) prefix sum from lowest to highest rank, skipping lowest rank
type accum = 0 ;
if ( global_accum ) {
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
accum += val ;
}
accum = atomic_fetch_add( global_accum , accum );
}
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
const type offset = accum ;
accum += val ;
val = offset ;
}
memory_fence();
}
team_fan_out();
return *work_value ;*/
return ArgType();
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
// Forwards to the two-argument overload with a null global accumulator.
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
{ return this-> template team_scan<Type>( value , 0 ); }
//----------------------------------------
// Private for the driver
private:
typedef execution_space::scratch_memory_space space ;
public:
// Builds the handle for the calling OpenMP thread. Team rank and vector
// lane are derived from omp_get_thread_num() and vector_length in the body.
// NOTE(review): the initializer list names m_vector_length before
// m_team_size, which differs from the member declaration order above;
// members are initialized in declaration order regardless.
inline
OpenMPTargetExecTeamMember( const int league_rank, const int league_size, const int team_size, const int vector_length //const TeamPolicyInternal< OpenMPTarget, Properties ...> & team
, void* const glb_scratch
, const int shmem_size_L1
, const int shmem_size_L2
)
: m_team_shared(0,0)
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
, m_team_rank(0)
, m_vector_length( vector_length )
, m_team_size( team_size )
, m_league_rank( league_rank )
, m_league_size( league_size )
, m_glb_scratch( glb_scratch )
{
const int omp_tid = omp_get_thread_num();
m_league_rank = league_rank;
m_team_rank = omp_tid/m_vector_length;
m_vector_lane = omp_tid%m_vector_length;
}
// Size reported for team reductions: TEAM_REDUCE_SIZE.
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
};
/** \brief Team execution policy for the OpenMPTarget execution space.
 *
 *  Stores league size, team size, vector length, per-level scratch sizes
 *  and a chunk size. The chunk size is computed automatically from the
 *  league size when the policy is constructed and can be replaced through
 *  set_chunk_size().
 */
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... >: public PolicyTraits<Properties ...>
{
public:

  //! Tag this class as a kokkos execution policy
  typedef TeamPolicyInternal      execution_policy ;

  typedef PolicyTraits<Properties ... > traits;

  /** \brief Member-wise assignment of all policy settings */
  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
    m_league_size = p.m_league_size;
    m_team_size = p.m_team_size;
    m_vector_length = p.m_vector_length;
    m_team_alloc = p.m_team_alloc;
    m_team_iter = p.m_team_iter;
    m_team_scratch_size[0] = p.m_team_scratch_size[0];
    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
    m_team_scratch_size[1] = p.m_team_scratch_size[1];
    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
    m_chunk_size = p.m_chunk_size;
    return *this;
  }

  //----------------------------------------

  /** \brief Largest supported team size (fixed value, functor is ignored) */
  template< class FunctorType >
  inline static
  int team_size_max( const FunctorType & )
    { return 1024; }

  /** \brief Recommended team size (fixed value, functor is ignored) */
  template< class FunctorType >
  inline static
  int team_size_recommended( const FunctorType & )
    { return 256; }

  /** \brief Recommended team size for a given vector length: 256/vector_length */
  template< class FunctorType >
  inline static
  int team_size_recommended( const FunctorType &, const int& vector_length)
    { return 256/vector_length; }

  //----------------------------------------

private:

  int m_league_size ;
  int m_team_size ;
  int m_vector_length;
  int m_team_alloc ;
  int m_team_iter ;

  size_t m_team_scratch_size[2];
  size_t m_thread_scratch_size[2];

  int m_chunk_size;

  /** \brief Number of teams the thread pool can hold at once, at least 1.
   *
   *  Computed as thread_pool_size(0) / m_team_alloc, with a non-positive
   *  m_team_alloc treated as 1 so the division is always defined.
   */
  inline int concurrent_team_count() const {
    const int alloc = ( m_team_alloc > 0 ) ? m_team_alloc : 1 ;
    const int count = traits::execution_space::thread_pool_size(0) / alloc ;
    return ( count > 0 ) ? count : 1 ;
  }

  /** \brief Store the requested sizes and derive the dependent settings */
  inline void init( const int league_size_request
                  , const int team_size_request
                  , const int vector_length_request )
    {
      m_league_size = league_size_request ;

      m_team_size = team_size_request;

      m_vector_length = vector_length_request;

      // m_team_alloc and m_team_iter used to be left uninitialized although
      // set_auto_chunk_size() divides by m_team_alloc and both values are
      // exposed through team_alloc() / team_iter(). Give them defined values.
      // NOTE(review): the chosen meaning (threads reserved per team, and
      // league iterations per concurrently running team) follows the host
      // backends' usage of these names - confirm it suits this backend.
      m_team_alloc = m_team_size ;

      const int team_count = concurrent_team_count();

      m_team_iter = ( m_league_size + team_count - 1 ) / team_count ;

      set_auto_chunk_size();
    }

public:

  inline int vector_length() const { return m_vector_length ; }
  inline int team_size()   const { return m_team_size ; }
  inline int league_size() const { return m_league_size ; }

  /** \brief Scratch bytes for one team at the given level.
   *
   *  Per-team size plus team_size_ times the per-thread size; a negative
   *  team_size_ (the default) means "use this policy's team size".
   */
  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
    if(team_size_ < 0)
      team_size_ = m_team_size;
    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
  }

  /** \brief  Specify league size, request team size */
  TeamPolicyInternal( typename traits::execution_space &
            , int league_size_request
            , int team_size_request
            , int vector_length_request = 1 )
            : m_team_scratch_size { 0 , 0 }
            , m_thread_scratch_size { 0 , 0 }
            , m_chunk_size(0)
    { init( league_size_request , team_size_request , vector_length_request); }

  /** \brief  Specify league size, team size chosen as 256/vector_length */
  TeamPolicyInternal( typename traits::execution_space &
            , int league_size_request
            , const Kokkos::AUTO_t & /* team_size_request */
            , int vector_length_request = 1)
            : m_team_scratch_size { 0 , 0 }
            , m_thread_scratch_size { 0 , 0 }
            , m_chunk_size(0)
    { init( league_size_request , 256/vector_length_request , vector_length_request ); }

  /** \brief  Specify league size, request team size */
  TeamPolicyInternal( int league_size_request
            , int team_size_request
            , int vector_length_request = 1 )
            : m_team_scratch_size { 0 , 0 }
            , m_thread_scratch_size { 0 , 0 }
            , m_chunk_size(0)
    { init( league_size_request , team_size_request , vector_length_request); }

  /** \brief  Specify league size, team size chosen as 256/vector_length */
  TeamPolicyInternal( int league_size_request
            , const Kokkos::AUTO_t & /* team_size_request */
            , int vector_length_request = 1 )
            : m_team_scratch_size { 0 , 0 }
            , m_thread_scratch_size { 0 , 0 }
            , m_chunk_size(0)
    { init( league_size_request , 256/vector_length_request , vector_length_request ); }

  inline int team_alloc() const { return m_team_alloc ; }
  inline int team_iter()  const { return m_team_iter ; }

  inline int chunk_size() const { return m_chunk_size ; }

  /** \brief set chunk_size to a discrete value*/
  inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
    TeamPolicyInternal p = *this;
    p.m_chunk_size = chunk_size_;
    return p;
  }

  /** \brief Copy of this policy with the per-team scratch size of a level replaced */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
    TeamPolicyInternal p = *this;
    p.m_team_scratch_size[level] = per_team.value;
    return p;
  }

  /** \brief Copy of this policy with the per-thread scratch size of a level replaced */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
    TeamPolicyInternal p = *this;
    p.m_thread_scratch_size[level] = per_thread.value;
    return p;
  }

  /** \brief Copy of this policy with both scratch sizes of a level replaced */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
    TeamPolicyInternal p = *this;
    p.m_team_scratch_size[level] = per_team.value;
    p.m_thread_scratch_size[level] = per_thread.value;
    return p;
  }

private:
  /** \brief finalize chunk_size if it was set to AUTO*/
  inline void set_auto_chunk_size() {

    const int concurrency = concurrent_team_count();

    if(m_chunk_size > 0) {
      if(!Impl::is_integral_power_of_two( m_chunk_size ))
        Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
    }

    // Smallest power of two with chunk*100*concurrency >= league size ...
    int new_chunk_size = 1;
    while(new_chunk_size*100*concurrency < m_league_size)
      new_chunk_size *= 2;
    // ... and, when that stays below 128, retry with factor 40, capped at 128.
    if(new_chunk_size < 128) {
      new_chunk_size = 1;
      while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
        new_chunk_size*=2;
    }
    m_chunk_size = new_chunk_size;
  }

public:
  typedef Impl::OpenMPTargetExecTeamMember member_type ;
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
// Size of the thread pool, reported as omp_get_max_threads().
// The depth argument is not used.
inline
int OpenMPTarget::thread_pool_size( int depth )
{
//return Impl::OpenMPTargetExec::pool_size(depth);
return omp_get_max_threads();
}
// Rank of the calling thread, reported as omp_get_thread_num().
KOKKOS_INLINE_FUNCTION
int OpenMPTarget::thread_pool_rank()
{
return omp_get_thread_num();
}
} // namespace Experimental
} // namespace Kokkos
namespace Kokkos {
/** \brief Build the team-thread loop descriptor for \c count iterations.
 *
 *  Forwards \c thread and \c count to the boundaries struct constructor. */
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> range_type;
  return range_type(thread,count);
}
/** \brief Build the team-thread loop descriptor for an explicit begin/end pair.
 *
 *  Forwards \c thread, \c begin and \c end to the boundaries struct constructor. */
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> range_type;
  return range_type(thread,begin,end);
}
/** \brief Build the thread-vector loop descriptor for \c count iterations.
 *
 *  Forwards \c thread and \c count to the boundaries struct constructor. */
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember > range_type;
  return range_type(thread,count);
}
// Wraps the team member in the tag type accepted by the ThreadSingleStruct
// overloads of single(), which run the functor on team rank 0 only.
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(const Impl::OpenMPTargetExecTeamMember& thread) {
  typedef Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> tag_type;
  return tag_type(thread);
}
// Wraps the team member in the tag type accepted by the VectorSingleStruct
// overloads of single().
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(const Impl::OpenMPTargetExecTeamMember& thread) {
  typedef Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> tag_type;
  return tag_type(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Team-level parallel_for. Calls lambda(iType i) for this thread's share of the range.
 *
 * The calling thread walks from loop_boundaries.start up to (not including)
 * loop_boundaries.end in steps of loop_boundaries.increment.
 * This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries, const Lambda& lambda) {
  iType idx = loop_boundaries.start;
  while( idx < loop_boundaries.end ) {
    lambda(idx);
    idx += loop_boundaries.increment;
  }
}
/** \brief Team-level summing parallel_reduce. Calls lambda(iType i, ValueType & val) for this thread's share of the range.
 *
 * result is reset to ValueType() and then accumulates, via +=, the contribution of every
 * index this thread visits. The team-wide combination step is currently disabled (see the
 * commented-out line at the end), so result holds only the calling thread's partial sum.
 * This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
                     const Lambda & lambda, ValueType& result) {
  result = ValueType();
  iType idx = loop_boundaries.start;
  while( idx < loop_boundaries.end ) {
    // Each call gets a fresh, value-initialized contribution.
    ValueType contribution = ValueType();
    lambda(idx,contribution);
    result+=contribution;
    idx+=loop_boundaries.increment;
  }
  //result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Team-level parallel_reduce with a user-supplied join. Calls lambda(iType i, ValueType & val)
 * for this thread's share of the range.
 *
 * The accumulator starts from the incoming value of init_result, so that value should be the
 * neutral element of the join operation (e.g. '0 for +-' or '1 for *'). Every contribution is
 * folded in with join(ValueType& val, const ValueType& update) and the accumulated value is
 * written back to init_result.
 *
 * The team-wide combination step is currently disabled (see the commented-out line at the end),
 * so, like the summing overload, init_result receives only the calling thread's partial result.
 * This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
  ValueType result = init_result;
  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
    ValueType tmp = ValueType();
    lambda(i,tmp);
    join(result,tmp);
  }
  // Without this write-back the whole reduction was silently discarded; the
  // ThreadVectorRange overload with a join stores its result the same way.
  init_result = result;
  //init_result = loop_boundaries.thread.team_reduce(result,join);
}
} //namespace Kokkos
namespace Kokkos {
/** \brief Vector-level parallel_for. Calls lambda(iType i) for this lane's share of the range.
 *
 * The caller walks from loop_boundaries.start up to (not including) loop_boundaries.end
 * in steps of loop_boundaries.increment.
 * This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
                  loop_boundaries, const Lambda& lambda) {
  iType idx = loop_boundaries.start;
  while( idx < loop_boundaries.end ) {
    lambda(idx);
    idx += loop_boundaries.increment;
  }
}
/** \brief Vector-level summing parallel_reduce. Calls lambda(iType i, ValueType & val) for this lane's share of the range.
 *
 * result is reset to ValueType() and then accumulates, via +=, the contribution of every
 * index the caller visits. No combination across lanes is performed in this function.
 * This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
                     loop_boundaries, const Lambda & lambda, ValueType& result) {
  result = ValueType();
  iType idx = loop_boundaries.start;
  while( idx < loop_boundaries.end ) {
    // Each call gets a fresh, value-initialized contribution.
    ValueType contribution = ValueType();
    lambda(idx,contribution);
    result+=contribution;
    idx+=loop_boundaries.increment;
  }
}
/** \brief Vector-level parallel_reduce with a user-supplied join. Calls lambda(iType i, ValueType & val)
 * for this lane's share of the range.
 *
 * The accumulator is seeded with the incoming value of init_result, so that value should be
 * the neutral element of the join operation (e.g. '0 for +-' or '1 for *'). Contributions are
 * folded in with join(ValueType& val, const ValueType& update) and the final accumulator is
 * stored back into init_result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
                     loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
  ValueType accumulated = init_result;
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx+=loop_boundaries.increment) {
    ValueType contribution = ValueType();
    lambda(idx,contribution);
    join(accumulated,contribution);
  }
  init_result = accumulated;
}
/** \brief Vector-level parallel_scan. Calls lambda(iType i, ValueType & val, bool final) for this
 * lane's share of the range.
 *
 * In this implementation the functor is invoked exactly once per visited index, always with
 * final==true, and a single running value (value-initialized before the loop) is threaded
 * through all calls. The functor is expected to add the contribution of "i" to val.
 * The running value is local to this function and is not handed back to the caller.
 * This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
                   loop_boundaries, const FunctorType & lambda) {
  typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::value_type value_type ;
  value_type running = value_type();
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
  for( iType idx = loop_boundaries.start; idx < loop_boundaries.end; idx+=loop_boundaries.increment) {
    lambda(idx,running,true);
  }
}
} // namespace Kokkos
namespace Kokkos {
// single() for the per-thread tag: the functor is invoked unconditionally by
// every caller; single_struct is not inspected.
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct,
            const FunctorType& lambda) {
  lambda();
}
// single() for the per-team tag: only the caller whose team_rank() is 0 runs
// the functor; all other callers return without doing anything.
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct,
            const FunctorType& lambda) {
  if( single_struct.team_member.team_rank()!=0 ) { return; }
  lambda();
}
// single() for the per-thread tag with an output value: every caller invokes
// the functor on its own val; single_struct is not inspected.
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct,
            const FunctorType& lambda, ValueType& val) {
  lambda(val);
}
// single() for the per-team tag with an output value: the caller whose
// team_rank() is 0 runs the functor on val, then every caller takes part in
// team_broadcast(val,0).
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct,
            const FunctorType& lambda, ValueType& val) {
  const bool is_root = ( single_struct.team_member.team_rank()==0 );
  if( is_root ) { lambda(val); }
  single_struct.team_member.team_broadcast(val,0);
}
}
#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */

View File

@ -0,0 +1,767 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP
#define KOKKOS_OPENMPTARGET_PARALLEL_HPP
#include <omp.h>
#include <iostream>
#include <Kokkos_Parallel.hpp>
#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Experimental::OpenMPTarget
>
{
private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
public:
inline void execute() const {
execute_impl<WorkTag>();
}
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
execute_impl() const
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const typename Policy::member_type begin = m_policy.begin();
const typename Policy::member_type end = m_policy.end();
#pragma omp target teams distribute parallel for map(to:this->m_functor)
for(int i=begin; i<end; i++)
m_functor(i);
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
execute_impl() const
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const typename Policy::member_type begin = m_policy.begin();
const typename Policy::member_type end = m_policy.end();
#pragma omp target teams distribute parallel for num_threads(128) map(to:this->m_functor)
for(int i=begin; i<end; i++)
m_functor(TagType(),i);
}
inline
ParallelFor( const FunctorType & arg_functor
, Policy arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Primary template: chosen only when no specialization matches the
// (FunctorHasJoin, UseReducerType) combination. It performs no reduction and
// never writes through result_ptr; it only reports the unsupported combination.
template< class FunctorType
        , class PolicyType
        , class ReducerType
        , class PointerType
        , class ValueType
        , int FunctorHasJoin
        , int UseReducerType >
struct ParallelReduceSpecialize {
  static inline void execute( const FunctorType& f
                            , const PolicyType& p
                            , PointerType result_ptr )
  {
    printf("Error: Invalid Specialization %i %i\n",FunctorHasJoin,UseReducerType);
  }
};
template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
typedef Kokkos::RangePolicy<PolicyArgs...> PolicyType;
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const typename PolicyType::member_type begin = p.begin();
const typename PolicyType::member_type end = p.end();
ValueType result = ValueType();
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
for(int i=begin; i<end; i++)
f(i,result);
*result_ptr=result;
}
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const typename PolicyType::member_type begin = p.begin();
const typename PolicyType::member_type end = p.end();
ValueType result = ValueType();
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
for(int i=begin; i<end; i++)
f(TagType(),i,result);
*result_ptr=result;
}
inline static
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
}
};
/*
template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType>
struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType, PointerType, ValueType, 0,1> {
#pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out, omp_in)) initializer ( ReducerType::init(omp_priv) )
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const typename PolicyType::member_type begin = p.begin();
const typename PolicyType::member_type end = p.end();
ValueType result = ValueType();
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(custom: result)
for(int i=begin; i<end; i++)
f(i,result);
*result_ptr=result;
}
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const typename PolicyType::member_type begin = p.begin();
const typename PolicyType::member_type end = p.end();
ValueType result = ValueType();
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(custom: result)
for(int i=begin; i<end; i++)
f(TagType(),i,result);
*result_ptr=result;
}
inline static
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
}
};
*/
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ...>
, ReducerType
, Kokkos::Experimental::OpenMPTarget
>
{
private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
// Static Assert WorkTag void if ReducerType not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
enum {UseReducer = is_reducer_type<ReducerType>::value };
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
public:
inline void execute() const {
ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
}
template< class ViewType >
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ViewType & arg_result_view
, typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// parallel_scan over a RangePolicy on the OpenMPTarget execution space.
//
// NOTE(review): in this version the class only stores the functor and the
// policy. The helper exec_range and the whole body of execute() are commented
// out below, so execute() returns without calling the functor at all.
template< class FunctorType , class ... Traits >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Experimental::OpenMPTarget
>
{
private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// Copies of the constructor arguments; not read by any active code in this class.
const FunctorType m_functor ;
const Policy m_policy ;
// Disabled helper: untagged and tagged exec_range, which would call the
// functor for every index in [ibeg, iend) with the given update/final pair.
/*
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update , const bool final )
{
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( iwork , update , final );
}
}
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update , const bool final )
{
const TagType t{} ;
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( t , iwork , update , final );
}
}
*/
public:
// Currently a no-op: everything between the braces is inside one comment.
// The disabled code is a host-side "#pragma omp parallel" scan (per-thread
// partial pass, serial combination of the partials, then a final pass).
inline
void execute() const
{
/* OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_scan");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_scan");
OpenMPTargetExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
#pragma omp parallel
{
OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr =
pointer_type( exec.scratch_reduce() ) +
ValueTraits::value_count( m_functor );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( m_functor , ptr ) , false );
}
{
const unsigned thread_count = OpenMPTargetExec::pool_size();
const unsigned value_count = ValueTraits::value_count( m_functor );
pointer_type ptr_prev = 0 ;
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
pointer_type ptr = pointer_type( OpenMPTargetExec::pool_rev(rank_rev)->scratch_reduce() );
if ( ptr_prev ) {
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
ValueJoin::join( m_functor , ptr + value_count , ptr );
}
else {
ValueInit::init( m_functor , ptr );
}
ptr_prev = ptr ;
}
}
#pragma omp parallel
{
OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueOps::reference( ptr ) , true );
}
*/
}
//----------------------------------------
// Stores copies of the functor and the policy.
inline
ParallelScan( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{}
//----------------------------------------
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Properties >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, Kokkos::Experimental::OpenMPTarget
>
{
private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
const int m_shmem_size ;
public:
inline void execute() const {
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
execute_impl<WorkTag>();
}
private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
execute_impl() const
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const int league_size = m_policy.league_size();
const int team_size = m_policy.team_size();
const int vector_length = m_policy.vector_length();
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
#pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
map(to:this->m_functor,scratch_ptr)
for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
m_functor(team);
}
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
execute_impl() const
{
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
const int league_size = m_policy.league_size();
const int team_size = m_policy.team_size();
const int vector_length = m_policy.vector_length();
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
#pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
map(to:this->m_functor,scratch_ptr)
for(int i=0 ; i<league_size ; i++) {
typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
m_functor(TagType(), team);
}
}
public:
inline
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
};
/** \brief Sum reduction over a TeamPolicy for functors without a join and without a reducer.
 *
 *  A flat loop of league_size*team_size*vector_length iterations is offloaded with an
 *  OpenMP '+' reduction. Each iteration builds a team member handle whose league rank is
 *  i/(team_size*vector_length) and calls the functor with it; after the call the private
 *  accumulator is reset to 0 on every vector lane other than lane 0. The reduced value is
 *  stored through result_ptr. The number of teams is clamped to MAX_ACTIVE_TEAMS. */
template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {

  typedef TeamPolicyInternal<PolicyArgs...> PolicyType;

  /** \brief Untagged variant: calls f(team,result). */
  template< class TagType >
  inline static
  typename std::enable_if< std::is_same< TagType , void >::value >::type
  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
  {
    // The labels name the operation actually running (previously "parallel_for").
    OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
    OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");

    const int league_size = p.league_size();
    const int team_size = p.team_size();
    const int vector_length = p.vector_length();
    const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;

    OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();

    ValueType result = ValueType();
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
    map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
    for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
      typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
      f(team,result);
      if(team.m_vector_lane!=0) result = 0;
    }

    *result_ptr=result;
  }

  /** \brief Tagged variant: calls f(TagType(),team,result). */
  template< class TagType >
  inline static
  typename std::enable_if< ! std::is_same< TagType , void >::value >::type
  execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
  {
    OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_reduce");
    OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_reduce");

    const int league_size = p.league_size();
    const int team_size = p.team_size();
    const int vector_length = p.vector_length();
    const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;

    OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
    void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();

    ValueType result = ValueType();
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
    map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
    for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
      typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
      f(TagType(),team,result);
      // Same member as in the untagged variant; this path previously read
      // 'vector_lane', which matches no other use of the team handle in this file.
      if(team.m_vector_lane!=0) result = 0;
    }

    *result_ptr=result;
  }

  /** \brief Entry point: dispatches on the policy's work tag. */
  inline static
  void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
    execute_impl<typename PolicyType::work_tag>(f,p,ptr);
  }
};
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Experimental::OpenMPTarget
>
{
private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename ValueTraits::value_type value_type ;
enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
enum {UseReducer = is_reducer_type<ReducerType>::value };
typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
const FunctorType m_functor ;
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const int m_shmem_size ;
public:
inline
void execute() const {
ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
}
template< class ViewType >
inline
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
}
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
typedef iType index_type;
const iType start;
const iType end;
const iType increment;
inline
TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
start( thread_.team_rank() ),
end( count ),
increment( thread_.team_size() )
{}
inline
TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
start( begin_+thread_.team_rank() ),
end( end_ ),
increment( thread_.team_size() )
{}
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
typedef iType index_type;
const iType start;
const iType end;
const iType increment;
inline
ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
start( thread_.m_vector_lane ),
end( count ),
increment( thread_.m_vector_length )
{}
inline
ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
start( begin_+thread_.m_vector_lane ),
end( end_ ),
increment( thread_.m_vector_length )
{}
};
// Builds the team-thread loop bounds for the range [0,count) as seen by the
// given team member.
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> boundaries_type ;
  return boundaries_type( thread , count );
}
// Builds the team-thread loop bounds for the range [begin,end) as seen by the
// given team member.
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> boundaries_type ;
  return boundaries_type( thread , begin , end );
}
// Builds the thread-vector loop bounds for the range [0,count) as seen by the
// given team member.
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember > boundaries_type ;
  return boundaries_type( thread , count );
}
// Builds the thread-vector loop bounds for the range [begin,end) as seen by
// the given team member.
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
  typedef Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember> boundaries_type ;
  return boundaries_type( thread , begin , end );
}
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */

View File

@ -0,0 +1,329 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
//----------------------------------------------------------------------------
// Default constructor: a stand-alone executor with no pool data attached
// (both exec pointers null), cleared barrier state, rank 0 and a team size
// of one.
TaskExec< Kokkos::Experimental::OpenMPTarget >::TaskExec()
  : m_self_exec( nullptr )
  , m_team_exec( nullptr )
  , m_sync_mask( 0 )
  , m_sync_value( 0 )
  , m_sync_step( 0 )
  , m_group_rank( 0 )
  , m_team_rank( 0 )
  , m_team_size( 1 )
  {}
// Constructs the executor for one thread of a team of arg_team_size threads.
//
// The group (team) index and the rank within the team are the quotient and
// remainder of arg_exec.pool_rank_rev() divided by arg_team_size.
// The first two int64_t words of this thread's scratch_reduce() area are
// zeroed, and the barrier constants are built with one byte per team member:
// m_sync_value gets 0x01 and m_sync_mask gets 0x03 in each member's byte.
//
// NOTE(review): the comment in the body says the team spans
// pool_rev( team_size * group_rank ) onward, while m_team_exec is
// initialized from pool_rev( pool_rank_rev() / team_size ), i.e. from the
// group rank itself -- confirm which index is intended.
TaskExec< Kokkos::Experimental::OpenMPTarget >::
TaskExec( Kokkos::Impl::OpenMPTargetExec & arg_exec , int const arg_team_size )
: m_self_exec( & arg_exec )
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
, m_team_size( arg_team_size )
{
// This team spans
// m_self_exec->pool_rev( team_size * group_rank )
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
// Clear the two alternating synchronization words.
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
// One byte per team member, so an int64_t holds at most 8 members.
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
}
Kokkos::memory_fence();
}
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Spin barrier across the members of this team.
//
// Aborts if the team's scratch_reduce area is smaller than two int64_t
// words.  Otherwise each member writes the low two bits of m_sync_value into
// its own byte (indexed by m_team_rank) of the current sync word and then
// busy-waits until the whole word equals m_sync_value.  The sync word
// alternates between the two scratch words according to the parity of
// m_sync_step.  After every second step m_sync_value is XOR-ed with
// m_sync_mask, which switches each member byte between 0x01 and 0x02, and
// m_sync_step is reset to 0 once it exceeds 1000.
void TaskExec< Kokkos::Experimental::OpenMPTarget >::team_barrier_impl() const
{
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<OpenMPTarget> scratch_reduce memory too small");
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
// Disabled debug trace (state before signalling).
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
// Disabled debug trace (state after the wait).
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
// Both sync words have now been used with the current value; flip it.
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
}
}
#endif
//----------------------------------------------------------------------------
// Drains the given task queue using an OpenMP parallel region.
//
// Threads are grouped into teams of PoolExec::pool_size(2) threads; the call
// aborts if that team size exceeds 8.  Inside the parallel region each team
// repeatedly:
//   1. lets its rank-0 member try to pop a ready task (by priority, then by
//      type) while queue->m_ready_count is positive,
//   2. broadcasts the acquired task pointer to the team through the team's
//      scratch_thread() memory (teams of more than one member only),
//   3. leaves the loop when the pointer is null, waits at a team barrier when
//      it is the 'end' tag (nothing popped this round), or otherwise runs the
//      task -- with the whole team for TaskTeam tasks, or on rank 0 only for
//      every other task type -- and has rank 0 complete it.
void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::execute
( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
{
using execution_space = Kokkos::Experimental::OpenMPTarget ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using PoolExec = Kokkos::Impl::OpenMPTargetExec ;
using Member = TaskExec< execution_space > ;
// Sentinel pointer value compared against the result of pop_task below.
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<OpenMPTarget> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
// single_exec runs single-thread tasks; team_exec runs team tasks.
Member single_exec ;
Member team_exec( self , team_size );
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
// Barrier across entire OpenMPTarget thread pool to ensure initialization
#pragma omp barrier
// Loop until all queues are empty and no tasks in flight
do {
task_root_type * task = 0 ;
// Each team lead attempts to acquire either a thread team task
// or a single thread task for the team.
if ( 0 == team_exec.team_rank() ) {
// 'end' means "keep looking"; 0 means the ready count has reached zero.
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
}
// Team lead broadcast acquired task to team members:
if ( 1 < team_exec.team_size() ) {
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
// Fence to be sure task_shared is stored before the barrier
Kokkos::memory_fence();
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
// Fence to be sure task_shared is stored
Kokkos::memory_fence();
task = *task_shared ;
}
// Disabled debug trace of the broadcast result.
#if 0
fprintf( stdout
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
, team_exec.m_group_rank
, team_exec.m_team_rank
, uintptr_t(task_shared)
, uintptr_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
if ( end == task ) {
// All team members wait for whole team to reach this statement.
// Is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
if ( 0 == team_exec.team_rank() ) {
(*task->m_apply)( task , & single_exec );
queue->complete( task );
}
// All team members wait for whole team to reach this statement.
// Not necessary to complete the task.
// Is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
} while(1);
}
// END #pragma omp parallel
}
// Executes ready tasks on the calling thread, but only when
// omp_get_num_threads() reports exactly one thread; otherwise does nothing.
//
// Tasks are popped by priority and then by type.  Each popped task is applied
// with a default-constructed (single member) executor and then completed.
// The function returns once a full sweep of the ready queues yields no task.
void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::
iff_single_thread_recursive_execute
  ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
{
  using execution_space = Kokkos::Experimental::OpenMPTarget ;
  using queue_type      = TaskQueue< execution_space > ;
  using task_root_type  = TaskBase< execution_space , void , void > ;
  using Member          = TaskExec< execution_space > ;

  if ( 1 != omp_get_num_threads() ) return ;

  // Sentinel returned by pop_task when no task was obtained.
  task_root_type * const none = (task_root_type *) task_root_type::EndTag ;

  Member solo_exec ;

  for (;;) {

    task_root_type * next = none ;

    // Sweep the ready queues until one of them hands out a task.
    for ( int priority = 0 ; priority < queue_type::NumQueue && none == next ; ++priority ) {
      for ( int type = 0 ; type < 2 && none == next ; ++type ) {
        next = queue_type::pop_task( & queue->m_ready[priority][type] );
      }
    }

    if ( none == next ) return ;

    (*next->m_apply)( next , & solo_exec );

    queue->complete( next );
  }
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,356 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >
{
public:
using execution_space = Kokkos::Experimental::OpenMPTarget ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::Experimental::OpenMPTarget
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
};
extern template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
//----------------------------------------------------------------------------
// Per-thread task execution handle for the OpenMPTarget back-end.
//
// Instances are neither copyable nor movable, and the constructors are
// private: only the befriended TaskQueue and TaskQueueSpecialization for
// OpenMPTarget can create them.  The public interface exposes the team rank
// and size, the team barrier and the team shared scratch memory.
template<>
class TaskExec< Kokkos::Experimental::OpenMPTarget >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::OpenMPTargetExec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
// Barrier state; the mutable members can be changed from const member functions.
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec , int arg_team_size );
void team_barrier_impl() const ;
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Team scratch memory, or a null pointer when no team exec is attached.
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
// Size of the team scratch memory, or 0 when no team exec is attached.
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
* before any team member returns from
* this function call.
*
* Does nothing when the team has a single member.
*/
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
#else
// Without KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST these are no-op stubs.
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
// Rank of this thread within its team.
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
// Number of threads in the team.
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
// Builds the team-thread loop bounds over 'count' iterations for an
// OpenMPTarget task executor.
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >
TeamThreadRange
  ( Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
  , const iType & count )
{
  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > > boundaries_type ;
  return boundaries_type( thread , count );
}
// Builds the team-thread loop bounds over the range from 'start' to 'end'
// for an OpenMPTarget task executor.
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >
TeamThreadRange
  ( Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
  , const iType & start
  , const iType & end )
{
  typedef Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > > boundaries_type ;
  return boundaries_type( thread , start , end );
}
/** \brief  Inter-thread parallel_for over a team-thread range.
 *
 *  Calls lambda(i) for every index this team member owns: starting at
 *  loop_boundaries.start and advancing by loop_boundaries.increment while
 *  the index is below loop_boundaries.end.
 *  This functionality requires C++11 support.
 */
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
  , const Lambda& lambda
  )
{
  iType idx = loop_boundaries.start ;
  while ( idx < loop_boundaries.end ) {
    lambda( idx );
    idx += loop_boundaries.increment ;
  }
}
// Team-level reduction over a team-thread range, combining with operator+=.
//
// Every member starts a private value from initialized_result and feeds its
// share of the range through lambda(i, value).  In a team of more than one
// member the private values are staged in the team shared memory, member 0
// adds the values of members 1..team_size-1 onto slot 0, and every member
// then copies slot 0 into initialized_result.  A single-member team writes
// its private value straight into initialized_result.
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  ( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
  , const Lambda& lambda
  , ValueType& initialized_result)
{
  const auto & member = loop_boundaries.thread ;
  const int my_rank = member.team_rank(); // member num within the team

  ValueType partial = initialized_result ;
  for ( iType idx = loop_boundaries.start ; idx < loop_boundaries.end ; idx += loop_boundaries.increment ) {
    lambda( idx , partial );
  }

  // Single-member team: nothing to exchange.
  if ( ! ( 1 < member.team_size() ) ) {
    initialized_result = partial ;
    return ;
  }

  ValueType * const staging = (ValueType*) member.team_shared();

  // Whole team arrives before the shared buffer is written.
  member.team_barrier();
  staging[my_rank] = partial ;

  // Whole team has published its partial value.
  member.team_barrier();

  // reduce across threads to thread 0
  if ( 0 == my_rank ) {
    for ( int other = 1 ; other < member.team_size() ; other++ ) {
      staging[0] += staging[other] ;
    }
  }

  // Wait for member 0 to finish combining, then broadcast result.
  member.team_barrier();
  initialized_result = staging[0] ;
}
// Team-level reduction over a team-thread range, combining with a
// caller-supplied join(dst, src) operation.
//
// Every member starts a private value from initialized_result and feeds its
// share of the range through lambda(i, value).  In a team of more than one
// member the private values are staged in the team shared memory, member 0
// joins the values of members 1..team_size-1 into slot 0, and every member
// then copies slot 0 into initialized_result.  A single-member team writes
// its private value straight into initialized_result.
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
  (const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
   const Lambda & lambda,
   const JoinType & join,
   ValueType& initialized_result)
{
  const auto & member = loop_boundaries.thread ;
  const int my_rank = member.team_rank(); // member num within the team

  ValueType partial = initialized_result ;
  for ( iType idx = loop_boundaries.start ; idx < loop_boundaries.end ; idx += loop_boundaries.increment ) {
    lambda( idx , partial );
  }

  // Single-member team: nothing to exchange.
  if ( ! ( 1 < member.team_size() ) ) {
    initialized_result = partial ;
    return ;
  }

  ValueType * const staging = (ValueType*) member.team_shared();

  // Whole team arrives before the shared buffer is written.
  member.team_barrier();
  staging[my_rank] = partial ;

  // Whole team has published its partial value.
  member.team_barrier();

  // reduce across threads to thread 0
  if ( 0 == my_rank ) {
    for ( int other = 1 ; other < member.team_size() ; other++ ) {
      join( staging[0] , staging[other] );
    }
  }

  // Wait for member 0 to finish combining, then broadcast result.
  member.team_barrier();
  initialized_result = staging[0] ;
}
// Placeholder for a future thread-vector-range reduction.
// The body is currently empty: lambda is never called and
// initialized_result is left unmodified.
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
}
// Placeholder for a future thread-vector-range reduction with a custom join.
// The body is currently empty: neither lambda nor join is called and
// initialized_result is left unmodified.
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
}
// Team-level scan over a team-thread range.
//
// lambda is called as lambda(i, value, final).  The function makes two
// passes over this member's indices:
//   1. a first pass that accumulates the member's total into 'accum',
//   2. after member 0 has turned the per-member totals in team shared memory
//      into running sums, a second pass that starts from the running sum of
//      the lower-ranked members (0 for member 0).
// ValueType is the first template parameter and does not appear in the
// function parameters, so callers have to name it explicitly.
//
// NOTE(review): lambda(i, val, true) is invoked in BOTH passes, so the
// "final" call happens twice per index -- the first time with an offset that
// only covers this member's own contributions.  Confirm this is intended.
// NOTE(review): unlike parallel_reduce above, team_shared() is written here
// without a team_size check -- confirm behavior for single-member executors.
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
int team_size = loop_boundaries.thread.team_size();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
// Intra-member scan
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
// Publish this member's total for the cross-member scan.
shared[team_rank] = accum;
loop_boundaries.thread.team_barrier();
// Member 0 do scan on accumulated totals
if (team_rank == 0) {
for( iType i = 1; i < team_size; i+=1) {
shared[i] += shared[i-1];
}
accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
}
loop_boundaries.thread.team_barrier();
// Inter-member scan adding in accumulated totals
if (team_rank != 0) { accum = shared[team_rank-1]; }
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
// Placeholder for a future thread-vector-range scan.
// The body is currently empty: lambda is never called.
// ValueType does not appear in the function parameters, so callers would
// have to name it explicitly.
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */

View File

@ -41,12 +41,13 @@
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <stdio.h>
#include <stdlib.h>
#include <Kokkos_Core_fwd.hpp>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <utility>
@ -215,6 +216,8 @@ void Qthreads::fence()
// Returns the value stored in Impl::s_number_shepherds.
int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
// Returns the value stored in Impl::s_number_workers_per_shepherd.
int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
// Returns the fixed string "Qthreads" as this execution space's name.
const char* Qthreads::name() { return "Qthreads"; }
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -516,4 +519,7 @@ QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMemb
} // namespace Kokkos
#else
// Empty function compiled only in the #else branch, i.e. when
// KOKKOS_ENABLE_QTHREADS is not defined.  Per its name it presumably exists
// so that this translation unit still defines a symbol -- TODO confirm.
void KOKKOS_SRC_QTHREADS_EXEC_PREVENT_LINK_ERROR() {}
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )

View File

@ -44,6 +44,9 @@
#ifndef KOKKOS_QTHREADSEXEC_HPP
#define KOKKOS_QTHREADSEXEC_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
@ -637,4 +640,6 @@ public:
//----------------------------------------------------------------------------
#endif
#endif // #define KOKKOS_QTHREADSEXEC_HPP

View File

@ -44,6 +44,9 @@
#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
#define KOKKOS_QTHREADS_PARALLEL_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <vector>
#include <Kokkos_Parallel.hpp>
@ -724,4 +727,6 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& sing
} // namespace Kokkos
#endif
#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */

View File

@ -41,10 +41,10 @@
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
@ -314,7 +314,7 @@ void TaskQueueSpecialization< Kokkos::Qthreads >::
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#else
// Empty function compiled only in the #else branch, i.e. when
// KOKKOS_ENABLE_QTHREADS and KOKKOS_ENABLE_TASKPOLICY are not both defined.
// Per its name it presumably exists so that this translation unit still
// defines a symbol -- TODO confirm.
void KOKKOS_SRC_QTHREADS_TASK_PREVENT_LINK_ERROR() {}
#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -44,7 +44,8 @@
#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
#define KOKKOS_IMPL_QTHREADS_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,20 +36,22 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD.
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <stdio.h>
#include <Kokkos_Core_fwd.hpp>
#include <cstdio>
#include <cstdlib>
#include <stdlib.h>
#include <stdexcept>
#include <iostream>
#include <sstream>
@ -448,7 +450,7 @@ TaskPolicy< Kokkos::Qthreads >::
TaskPolicy
( const unsigned /* arg_task_max_count */
, const unsigned /* arg_task_max_size */
, const unsigned arg_task_default_dependence_capacity
, const unsigned arg_task_default_dependence_capacity
, const unsigned arg_task_team_size
)
: m_default_dependence_capacity( arg_task_default_dependence_capacity )
@ -484,5 +486,8 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
} // namespace Experimental
} // namespace Kokkos
#else
// Empty function compiled only in the #else branch of this file's feature
// guard.  Per its name it presumably exists so that this translation unit
// still defines a symbol -- TODO confirm.
void KOKKOS_CORE_SRC_QTHREADS_KOKKOS_QTHREADS_TASKPOLICY_PREVENT_LINK_ERROR() {}
#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -46,6 +46,9 @@
#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
#include <string>
#include <typeinfo>
#include <stdexcept>
@ -69,7 +72,6 @@
#include <impl/Kokkos_FunctorAdapter.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------
@ -505,7 +507,7 @@ public:
template< class ValueType >
const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f
spawn( const Future< ValueType , execution_space > & f
, const bool priority = false ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
@ -632,7 +634,7 @@ public:
}
template< class FunctorType >
void respawn( FunctorType * task_functor
void respawn( FunctorType * task_functor
, const bool priority = false ) const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )

View File

@ -41,7 +41,11 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_TASKPOLICY )
#ifndef KOKKOS_QTHREADS_TASKQUEUE_HPP
#define KOKKOS_QTHREADS_TASKQUEUE_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -60,7 +64,7 @@ private:
using execution_space = Kokkos::Qthread ;
using memory_space = Kokkos::HostSpace
using device_type = Kokkos::Device< execution_space, memory_space > ;
using memory_pool = Kokkos::Experimental::MemoryPool< device_type > ;
using memory_pool = Kokkos::MemoryPool< device_type > ;
using task_root_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
friend class Kokkos::TaskScheduler< execution_space > ;
@ -317,3 +321,5 @@ public:
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif // KOKKOS_QTHREADS_TASKQUEUE_HPP

View File

@ -41,7 +41,11 @@
//@HEADER
*/
#if defined( KOKKOS_ENABLE_TASKPOLICY )
#ifndef KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
#define KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
namespace Kokkos {
namespace Impl {
@ -433,4 +437,5 @@ fflush(stdout);
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif // KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP

View File

@ -41,11 +41,13 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_THREADS )
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
#include <stdint.h>
#include <cstdint>
#include <limits>
#include <utility>
#include <iostream>
@ -512,8 +514,8 @@ void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
s << "Kokkos::Threads" ;
#if defined( KOKKOS_ENABLE_PTHREAD )
s << " KOKKOS_ENABLE_PTHREAD" ;
#if defined( KOKKOS_ENABLE_THREADS )
s << " KOKKOS_ENABLE_THREADS" ;
#endif
#if defined( KOKKOS_ENABLE_HWLOC )
s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
@ -817,10 +819,12 @@ int Threads::thread_pool_rank()
}
#endif
// Returns the fixed string "Threads" as this execution space's name.
const char* Threads::name() { return "Threads"; }
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD ) */
#else
// Empty function compiled only in the #else branch, i.e. when
// KOKKOS_ENABLE_THREADS is not defined.  Per its name it presumably exists
// so that this translation unit still defines a symbol -- TODO confirm.
void KOKKOS_CORE_SRC_THREADS_EXEC_PREVENT_LINK_ERROR() {}
#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */

View File

@ -44,7 +44,10 @@
#ifndef KOKKOS_THREADSEXEC_HPP
#define KOKKOS_THREADSEXEC_HPP
#include <stdio.h>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_THREADS )
#include <cstdio>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
@ -626,6 +629,6 @@ inline void Threads::fence()
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
#endif /* #define KOKKOS_THREADSEXEC_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,18 +36,16 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_PTHREAD )
#include <Kokkos_Core_fwd.hpp>
/* Standard 'C' Linux libraries */
#include <pthread.h>
@ -154,6 +152,8 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value )
#elif defined( KOKKOS_ENABLE_WINTHREAD )
#include <Kokkos_Core_fwd.hpp>
/* Windows libraries */
#include <winsock2.h>
#include <windows.h>
@ -247,9 +247,7 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value ) {}
} // namespace Impl
} // namespace Kokkos
#else
// Intentionally empty. Compiled only on the #else branch, i.e. when neither
// KOKKOS_ENABLE_PTHREAD nor KOKKOS_ENABLE_WINTHREAD is defined; going by its
// name it exists so that this translation unit still provides a symbol then.
void KOKKOS_CORE_SRC_THREADS_EXEC_BASE_PREVENT_LINK_ERROR()
{
}
#endif /* end #elif defined( KOKKOS_ENABLE_WINTHREAD ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -44,7 +44,10 @@
#ifndef KOKKOS_THREADSTEAM_HPP
#define KOKKOS_THREADSTEAM_HPP
#include <stdio.h>
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_THREADS )
#include <cstdio>
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
@ -173,7 +176,9 @@ public:
}
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
KOKKOS_INLINE_FUNCTION
typename std::enable_if< !Kokkos::is_reducer< Type >::value , Type>::type
team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
@ -202,6 +207,55 @@ public:
}
#endif
/** \brief Team-wide reduction driven by a reducer object.
 *
 *  This overload participates only when
 *  Kokkos::is_reducer< ReducerType >::value is true. It returns nothing;
 *  the outcome is written through reducer.reference().
 *
 *  When KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST is not defined the body is
 *  empty. Otherwise each calling thread stores reducer.reference() in its own
 *  scratch memory, the thread for which team_fan_in() returns true combines
 *  the members' values with reducer.join(), and after team_fan_out() each
 *  caller reads its scratch slot back into reducer.reference().
 *
 *  \param reducer  Supplies value_type, reference() and join().
 */
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
team_reduce( const ReducerType & ) const
{}
#else
team_reduce( const ReducerType & reducer ) const
{
typedef typename ReducerType::value_type value_type;
// Make sure there is enough scratch space:
// 'type' is value_type only if sizeof(value_type) < TEAM_REDUCE_SIZE; otherwise
// it is void and the dereferences below do not compile (assuming if_c selects
// its second argument when the condition holds -- TODO confirm).
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
// m_exec is null: return without touching the reducer.
if ( 0 == m_exec ) return ;
type * const local_value = ((type*) m_exec->scratch_memory());
// Set this thread's contribution
*local_value = reducer.reference() ;
// Fence to make sure the base team member has access:
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
type * const team_value = ((type*) m_team_base[0]->scratch_memory());
// Join to the team value:
// (members 1 .. m_team_size-1 are folded into member 0's scratch slot)
for ( int i = 1 ; i < m_team_size ; ++i ) {
reducer.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
}
// Team base thread may "lap" member threads so copy out to their local value.
for ( int i = 1 ; i < m_team_size ; ++i ) {
*((type*) m_team_base[i]->scratch_memory()) = *team_value ;
}
// Fence to make sure all team members have access
memory_fence();
}
team_fan_out();
// Value was changed by the team base
// (read back through a volatile-qualified pointer)
reducer.reference() = *((type volatile const *) local_value);
}
#endif
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
@ -760,7 +814,8 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thread
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
@ -774,6 +829,21 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thr
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Team-thread-range parallel_reduce taking a reducer object.
 *
 *  Selected only when Kokkos::is_reducer< ReducerType >::value is true.
 *  The reducer initializes its referenced value, lambda(i, value) is called
 *  for every i from loop_boundaries.start up to (excluding)
 *  loop_boundaries.end in steps of loop_boundaries.increment, and the
 *  reducer is then handed to the team member's team_reduce().
 */
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember > & loop_boundaries
               , const Lambda & lambda
               , const ReducerType & reducer )
{
  // Let the reducer set up the value it refers to.
  reducer.init( reducer.reference() );

  // Accumulate this thread's share of the range.
  iType idx = loop_boundaries.start ;
  while ( idx < loop_boundaries.end ) {
    lambda( idx , reducer.reference() );
    idx += loop_boundaries.increment ;
  }

  // Combine the per-thread values across the team.
  loop_boundaries.thread.team_reduce( reducer );
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
@ -822,17 +892,26 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Thre
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
}
/** \brief Thread-vector-range parallel_reduce taking a reducer object.
 *
 *  Selected only when Kokkos::is_reducer< ReducerType >::value is true.
 *  The reducer initializes its referenced value, then lambda(i, value) is
 *  called for every i from loop_boundaries.start up to (excluding)
 *  loop_boundaries.end in steps of loop_boundaries.increment. No team-level
 *  combination is performed here.
 */
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember > & loop_boundaries
               , const Lambda & lambda
               , const ReducerType & reducer )
{
  // Let the reducer set up the value it refers to.
  reducer.init( reducer.reference() );

  iType idx = loop_boundaries.start ;
  while ( idx < loop_boundaries.end ) {
    lambda( idx , reducer.reference() );
    idx += loop_boundaries.increment ;
  }
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
@ -915,6 +994,6 @@ void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
#endif /* #define KOKKOS_THREADSTEAM_HPP */

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,8 +44,11 @@
#ifndef KOKKOS_THREADS_PARALLEL_HPP
#define KOKKOS_THREADS_PARALLEL_HPP
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_THREADS )
#include <vector>
#include <iostream>
#include <iostream>
#include <Kokkos_Parallel.hpp>
@ -83,7 +86,7 @@ private:
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend )
{
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
@ -99,7 +102,7 @@ private:
, const Member ibeg , const Member iend )
{
const TagType t{} ;
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
@ -308,7 +311,7 @@ private:
, const Member & ibeg , const Member & iend
, reference_type update )
{
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
@ -325,7 +328,7 @@ private:
, reference_type update )
{
const TagType t{} ;
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
@ -348,7 +351,7 @@ private:
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::template exec_range< WorkTag >
( self.m_functor , range.begin() , range.end()
( self.m_functor , range.begin() , range.end()
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
@ -428,7 +431,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_result_ptr( reducer.view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
@ -543,7 +546,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_result_ptr( reducer.view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
@ -584,7 +587,7 @@ private:
, const Member & ibeg , const Member & iend
, reference_type update , const bool final )
{
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
@ -601,7 +604,7 @@ private:
, reference_type update , const bool final )
{
const TagType t{} ;
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
#pragma ivdep
#endif
@ -654,5 +657,6 @@ public:
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */

View File

@ -44,24 +44,20 @@
#ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP
#define KOKKOS_HOST_EXP_ITERATE_TILE_HPP
#include <iostream>
#include <algorithm>
#include <stdio.h>
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
// Turn on ivdep hints for the host MDRange loops when aggressive
// vectorization is requested, the compiler supports the pragma, and this is
// not a CUDA device compilation pass.
// The other range loops in this code base test KOKKOS_ENABLE_PRAGMA_IVDEP, so
// that spelling is honoured here as well; KOKKOS_HAVE_PRAGMA_IVDEP is still
// accepted so existing configurations keep working.
#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \
    ( defined(KOKKOS_ENABLE_PRAGMA_IVDEP) || defined(KOKKOS_HAVE_PRAGMA_IVDEP) ) && \
    !defined(__CUDA_ARCH__)
#define KOKKOS_MDRANGE_IVDEP
#endif
// KOKKOS_ENABLE_IVDEP_MDRANGE is placed in front of the MDRange for-loops:
// it expands to an ivdep pragma when KOKKOS_MDRANGE_IVDEP is defined and to
// nothing otherwise.
#if defined( KOKKOS_MDRANGE_IVDEP )
  #define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep")
#else
  #define KOKKOS_ENABLE_IVDEP_MDRANGE
#endif
#include <iostream>
#include <algorithm>
#include <cstdio>
namespace Kokkos { namespace Experimental { namespace Impl {
@ -243,7 +239,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
KOKKOS_ENABLE_IVDEP_MDRANGE \
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
APPLY( func, i0 + m_offset[0] ) \
}
}
#define LOOP_LAYOUT_2( func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -255,7 +251,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
LOOP_R_1( func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_3( func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -267,7 +263,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
LOOP_R_2( func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_4( func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -279,7 +275,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
LOOP_R_3( func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_5( func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -291,7 +287,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
LOOP_R_4( func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_6( func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -303,7 +299,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
LOOP_R_5( func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_7( func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -315,7 +311,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
LOOP_R_6( func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_8( func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -327,7 +323,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
LOOP_R_7( func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
} \
}
}
// Partial vs Full Tile
#define TILE_LOOP_1( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -459,7 +455,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
KOKKOS_ENABLE_IVDEP_MDRANGE \
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
APPLY_REDUX( val, func, i0 + m_offset[0] ) \
}
}
#define LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -471,7 +467,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
LOOP_R_1_REDUX( val, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -483,7 +479,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
LOOP_R_2_REDUX( val, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -495,7 +491,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
LOOP_R_3_REDUX( val, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -507,7 +503,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
LOOP_R_4_REDUX( val, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -519,7 +515,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
LOOP_R_5_REDUX( val, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -531,7 +527,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
LOOP_R_6_REDUX( val, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
} \
}
}
#define LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -543,7 +539,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
LOOP_R_7_REDUX( val, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
} \
}
}
// Partial vs Full Tile
#define TILE_LOOP_1_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -676,7 +672,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
KOKKOS_ENABLE_IVDEP_MDRANGE \
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
TAGGED_APPLY( tag, func, i0 + m_offset[0] ) \
}
}
#define TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -688,7 +684,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -700,7 +696,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -712,7 +708,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -724,7 +720,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -736,7 +732,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -748,7 +744,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -760,7 +756,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
} \
}
}
// Partial vs Full Tile
#define TAGGED_TILE_LOOP_1( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -892,7 +888,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
KOKKOS_ENABLE_IVDEP_MDRANGE \
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[0] ) \
}
}
#define TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -904,7 +900,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -916,7 +912,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -928,7 +924,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -940,7 +936,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -952,7 +948,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -964,7 +960,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
} \
}
}
#define TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
if (is_left) { \
@ -976,7 +972,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
} \
}
}
// Partial vs Full Tile
#define TAGGED_TILE_LOOP_1_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
@ -1321,8 +1317,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
}
else {
is_full_tile = false ;
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
}
}
@ -1332,7 +1328,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
template <int Rank>
struct RankTag
struct RankTag
{
typedef RankTag type;
enum { value = (int)Rank };
@ -1343,13 +1339,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
inline
void
operator()(IType tile_idx) const
{
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1361,13 +1357,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
}
#else
#else
template <typename IType>
inline
void
@ -1384,7 +1380,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1396,7 +1392,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
@ -1437,7 +1433,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1490,7 +1486,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1543,7 +1539,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1596,7 +1592,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1649,7 +1645,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1702,7 +1698,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1804,8 +1800,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
}
else {
is_full_tile = false ;
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
}
}
@ -1815,7 +1811,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
template <int Rank>
struct RankTag
struct RankTag
{
typedef RankTag type;
enum { value = (int)Rank };
@ -1827,13 +1823,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
inline
void
operator()(IType tile_idx) const
{
{
point_type m_offset;
point_type m_tiledims;
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1845,13 +1841,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
}
#else
#else
template <typename IType>
inline
void
@ -1869,7 +1865,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1881,7 +1877,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
}
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
if (RP::inner_direction == RP::Left) {
if ( full_tile ) {
@ -1922,7 +1918,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -1975,7 +1971,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -2028,7 +2024,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -2081,7 +2077,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -2134,7 +2130,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -2187,7 +2183,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
if (RP::outer_direction == RP::Left) {
for (int i=0; i<RP::rank; ++i) {
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
tile_idx /= m_rp.m_tile_end[i];
}
}
@ -2352,5 +2348,5 @@ struct MDFunctor< MDRange, Functor, void >
} } } //end namespace Kokkos::Experimental::Impl
#endif

View File

@ -195,3 +195,4 @@ struct PolicyTraits
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,10 +36,12 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_HPP )
#define KOKKOS_ATOMIC_ASSEMBLY_HPP
namespace Kokkos {
@ -110,3 +112,4 @@ namespace Impl {
}
#endif

View File

@ -41,6 +41,7 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP

View File

@ -41,6 +41,7 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
#define KOKKOS_ATOMIC_DECREMENT_HPP
@ -117,3 +118,4 @@ void atomic_decrement(volatile T* a) {
} // End of namespace Kokkos
#endif

View File

@ -41,6 +41,7 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
#define KOKKOS_ATOMIC_EXCHANGE_HPP
@ -356,5 +357,3 @@ void atomic_assign( volatile T * const dest , const T val )
#endif
//----------------------------------------------------------------------------

View File

@ -41,6 +41,7 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
@ -279,6 +280,7 @@ T atomic_fetch_add( volatile T * const dest ,
{
while( !Impl::lock_address_host_space( (void*) dest ) );
T return_val = *dest;
// Don't use the following line of code here:
//
//const T tmp = *dest = return_val + val;
@ -292,6 +294,7 @@ T atomic_fetch_add( volatile T * const dest ,
const T tmp = *dest;
(void) tmp;
Impl::unlock_address_host_space( (void*) dest );
return return_val;
}
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,11 +36,12 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
#define KOKKOS_ATOMIC_FETCH_AND_HPP
@ -124,4 +125,3 @@ void atomic_and(volatile T * const dest, const T src) {
#endif

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,11 +36,12 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
#define KOKKOS_ATOMIC_FETCH_OR_HPP
@ -124,4 +125,3 @@ void atomic_or(volatile T * const dest, const T src) {
#endif

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,11 +36,12 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
@ -238,4 +239,3 @@ void atomic_sub(volatile T * const dest, const T src) {
#include<impl/Kokkos_Atomic_Assembly.hpp>
#endif

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,10 +36,12 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
#define KOKKOS_ATOMIC_GENERIC_HPP
#include <Kokkos_Macros.hpp>
@ -424,6 +426,6 @@ T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
}
}
} // namespace Kokkos
#endif

View File

@ -41,6 +41,7 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
#define KOKKOS_ATOMIC_INCREMENT_HPP
@ -115,3 +116,4 @@ void atomic_increment(volatile T* a) {
} // End of namespace Kokkos
#endif

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -428,3 +428,4 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
}} // namespace Kokkos::Impl
#endif

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,12 +36,13 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
#define KOKKOS_ATOMIC_WINDOWS_HPP
#ifdef _WIN32
#define NOMINMAX

View File

@ -45,12 +45,38 @@
#define KOKKOS_BITOPS_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <cstdint>
#include <climits>
namespace Kokkos {
namespace Impl {
/**\brief  Find first zero bit.
 *
 *  Returns the index of the least significant zero bit of 'i',
 *  or -1 when every bit of 'i' is set.
 */
KOKKOS_FORCEINLINE_FUNCTION
int bit_first_zero( unsigned i ) noexcept
{
  // All bits set: the one input that has no zero bit.
  enum : unsigned { full = ~0u };

#if defined( __CUDA_ARCH__ )
  return full != i ? __ffs( ~i ) - 1 : -1 ;
#elif defined( KOKKOS_COMPILER_INTEL )
  return full != i ? _bit_scan_forward( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_IBM )
  return full != i ? __cnttz4( ~i ) : -1 ;
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
  return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
#else
  // Portable fallback: walk upward from bit 0 until a clear bit is found.
  int offset = -1 ;
  if ( full != i ) {
    // Shift an unsigned one so that probing the top bit does not
    // shift a signed value into its sign bit.
    for ( offset = 0 ; i & ( 1u << offset ) ; ++offset );
  }
  return offset ;
#endif
}
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_forward( unsigned i )
{
@ -121,7 +147,16 @@ int bit_count( unsigned i )
#endif
}
/**\brief  Exponent of a power of two that is at least N.
 *
 *  Starts from lg2 = bit_scan_reverse( N ) and bumps it by one
 *  when ( 1u << lg2 ) is still smaller than N.
 *  NOTE(review): presumably bit_scan_reverse yields the index of the
 *  highest set bit - confirm against its definition.
 */
KOKKOS_INLINE_FUNCTION
unsigned integral_power_of_two_that_contains( const unsigned N )
{
  const unsigned lg2 = Kokkos::Impl::bit_scan_reverse( N );

  if ( ( 1u << lg2 ) < N ) {
    return lg2 + 1 ;
  }

  return lg2 ;
}
} // namespace Impl
} // namespace Kokkos
#endif // KOKKOS_BITOPS_HPP

View File

@ -122,3 +122,4 @@ int mpi_local_rank_on_node() {
}
}

View File

@ -0,0 +1,106 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CLOCKTIC_HPP
#define KOKKOS_CLOCKTIC_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <chrono>
namespace Kokkos {
namespace Impl {
/**\brief  Quick query of clock register tics
 *
 *  Primary use case is to, with low overhead,
 *  obtain an integral value that consistently varies
 *  across concurrent threads of execution within
 *  a parallel algorithm.
 *  This value is often used to "randomly" seed an
 *  attempt to acquire an indexed resource (e.g., bit)
 *  from an array of resources (e.g., bitset) such that
 *  concurrent threads will have high likelihood of
 *  having different index-seed values.
 *
 *  The source of the value depends on the target:
 *  clock64() in CUDA device code, 'rdtsc' on x86,
 *  'mftb' on PowerPC, otherwise the tick count of
 *  std::chrono::high_resolution_clock.
 */
KOKKOS_FORCEINLINE_FUNCTION
uint64_t clock_tic(void) noexcept
{
#if defined( __CUDA_ARCH__ )

  // Return value of 64-bit hi-res clock register.

  return clock64();

#elif defined( __i386__ ) || defined( __x86_64 )

  // Return value of 64-bit hi-res clock register.
  // 'rdtsc' delivers the low half in eax and the high half in edx.

  unsigned a = 0, d = 0;

  __asm__ volatile( "rdtsc" : "=a" (a), "=d" (d) );

  return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 );

#elif defined( __powerpc )   || defined( __powerpc__ ) || \
      defined( __powerpc64__ ) || defined( __POWERPC__ ) || \
      defined( __ppc__ )     || defined( __ppc64__ )

  // Use a register-width destination: with a 32-bit 'unsigned int'
  // the upper half of the time base is discarded on 64-bit targets.

  unsigned long cycles = 0;

  asm volatile( "mftb %0" : "=r" (cycles) );

  return (uint64_t) cycles;

#else

  // No known cycle counter: fall back to the standard library clock.

  return (uint64_t)
    std::chrono::high_resolution_clock::now().time_since_epoch().count();

#endif
}
} // namespace Impl
} // namespace Kokkos
#endif // KOKKOS_CLOCKTIC_HPP

View File

@ -0,0 +1,357 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CONCURRENTBITSET_HPP
#define KOKKOS_CONCURRENTBITSET_HPP
#include <stdint.h>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>
#include <impl/Kokkos_ClockTic.hpp>
namespace Kokkos {
namespace Impl {
/**\brief  Lock-free bitset operating on a caller-provided uint32_t buffer.
 *
 *  Every member is static; the struct only groups the constants and
 *  the operations.  Buffer layout:
 *
 *    buffer[0]    : { state_header | used_count }
 *    buffer[1...] : the bits, 32 per word
 */
struct concurrent_bitset {
public:

  // 32 bits per integer value

  enum : uint32_t { bits_per_int_lg2  = 5 };
  enum : uint32_t { bits_per_int_mask = ( 1 << bits_per_int_lg2 ) - 1 };

  // Buffer is uint32_t[ buffer_bound ]
  //   [ uint32_t { state_header | used_count } , uint32_t bits[*] ]
  //
  //  Maximum bit count is 33 million (1u<<25):
  //
  //  - Maximum bit set size occupies 1M uint32_t words (4 Mbyte)
  //
  //  - State header can occupy bits [30-26]
  //    which can be the bit_count_lg2
  //
  //  - Accept at least 33 million concurrent calls to 'acquire'
  //    before risking an overflow race condition on a full bitset.

  enum : uint32_t { max_bit_count_lg2 = 25 };
  enum : uint32_t { max_bit_count     = 1u << max_bit_count_lg2 };
  enum : uint32_t { state_shift       = 26 };
  enum : uint32_t { state_used_mask   = ( 1 << state_shift ) - 1 };
  enum : uint32_t { state_header_mask = uint32_t(0x001f) << state_shift };

  /**\brief  Buffer length, in uint32_t words, for 2^bit_bound_lg2 bits.
   *
   *  One header word plus the bit words (never fewer than one bit word).
   *  Returns 0 when max_bit_count_lg2 < bit_bound_lg2.
   */
  KOKKOS_INLINE_FUNCTION static constexpr
  uint32_t buffer_bound_lg2( uint32_t const bit_bound_lg2 ) noexcept
    {
      return bit_bound_lg2 <= max_bit_count_lg2
           ? 1 + ( 1u << ( bit_bound_lg2 > bits_per_int_lg2
                         ? bit_bound_lg2 - bits_per_int_lg2 : 0 ) )
           : 0 ;
    }

  /**\brief  Buffer length, in uint32_t words, for bit_bound bits.
   *
   *  One header word plus enough bit words to hold bit_bound bits,
   *  a trailing partial word included.
   *  Returns 0 when max_bit_count < bit_bound.
   */
  KOKKOS_INLINE_FUNCTION static constexpr
  uint32_t buffer_bound( uint32_t const bit_bound ) noexcept
    {
      return bit_bound <= max_bit_count
           ? 1 + ( bit_bound >> bits_per_int_lg2 ) +
             ( bit_bound & bits_per_int_mask ? 1 : 0 )
           : 0 ;
    }

  /**\brief  Claim any bit within the bitset bound.
   *
   *  Return : ( which_bit , bit_count )
   *
   *  if success then
   *    bit_count is the atomic-count of claimed > 0
   *    which_bit is the claimed bit >= 0
   *  else if attempt failed due to filled buffer
   *    bit_count == which_bit == -1
   *  else if attempt failed due to non-matching state_header
   *    bit_count == which_bit == -2
   *  else if attempt failed due to max_bit_count_lg2 < bit_bound_lg2
   *                         or invalid state_header
   *                         or (1u << bit_bound_lg2) <= bit
   *    bit_count == which_bit == -3
   *  endif
   *
   *  Recommended to have hint
   *    bit = Kokkos::Impl::clock_tic() & ((1u<<bit_bound_lg2) - 1)
   */
  KOKKOS_INLINE_FUNCTION static
  Kokkos::pair<int,int>
  acquire_bounded_lg2( uint32_t volatile * const buffer
                     , uint32_t const bit_bound_lg2
                     , uint32_t bit = 0                /* optional hint */
                     , uint32_t const state_header = 0 /* optional header */
                     ) noexcept
    {
      typedef Kokkos::pair<int,int> type ;

      // Validate the exponent before it is used as a shift count.

      if ( ( max_bit_count_lg2 < bit_bound_lg2 ) ||
           ( state_header & ~state_header_mask ) ) {
        return type(-3,-3);
      }

      const uint32_t bit_bound  = 1u << bit_bound_lg2 ;
      const uint32_t word_count = bit_bound >> bits_per_int_lg2 ;

      // A hint equal to bit_bound already addresses a bit (and possibly
      // a word) outside of the bitset, hence '<=' and not '<'.

      if ( bit_bound <= bit ) {
        return type(-3,-3);
      }

      // Use potentially two fetch_add to avoid CAS loop.
      // Could generate "racing" failure-to-acquire
      // when is full at the atomic_fetch_add(+1)
      // then a release occurs before the atomic_fetch_add(-1).

      const uint32_t state = (uint32_t)
        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );

      const uint32_t state_error =
        state_header != ( state & state_header_mask );

      const uint32_t state_bit_used = state & state_used_mask ;

      if ( state_error || ( bit_bound <= state_bit_used ) ) {
        // Undo the speculative increment of the used count.
        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
        return state_error ? type(-2,-2) : type(-1,-1);
      }

      // Do not update bit until count is visible:

      Kokkos::memory_fence();

      // There is a zero bit available somewhere,
      // now find the (first) available bit and set it.

      while(1) {

        const uint32_t word = bit >> bits_per_int_lg2 ;
        const uint32_t mask = 1u << ( bit & bits_per_int_mask );
        const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);

        if ( ! ( prev & mask ) ) {
          // Successfully claimed 'result.first' by
          // atomically setting that bit.
          return type( bit , state_bit_used + 1 );
        }

        // Failed race to set the selected bit
        // Find a new bit to try.

        const int j = Kokkos::Impl::bit_first_zero( prev );

        if ( 0 <= j ) {
          bit = ( word << bits_per_int_lg2 ) | uint32_t(j);
        }
        else {
          // This word is full: move to the next word (wrapping to the
          // first one) and keep the offset within the word.
          bit =
            ( (word+1) < word_count ? ((word+1) << bits_per_int_lg2) : 0 )
            | ( bit & bits_per_int_mask );
        }
      }
    }

  /**\brief  Claim any bit within the bitset bound.
   *
   *  Return : ( which_bit , bit_count )
   *
   *  if success then
   *    bit_count is the atomic-count of claimed > 0
   *    which_bit is the claimed bit >= 0
   *  else if attempt failed due to filled buffer
   *    bit_count == which_bit == -1
   *  else if attempt failed due to non-matching state_header
   *    bit_count == which_bit == -2
   *  else if attempt failed due to max_bit_count < bit_bound
   *                         or invalid state_header
   *                         or bit_bound <= bit
   *    bit_count == which_bit == -3
   *  endif
   *
   *  Recommended to have hint
   *    bit = Kokkos::Impl::clock_tic() % bit_bound
   */
  KOKKOS_INLINE_FUNCTION static
  Kokkos::pair<int,int>
  acquire_bounded( uint32_t volatile * const buffer
                 , uint32_t const bit_bound
                 , uint32_t bit = 0                /* optional hint */
                 , uint32_t const state_header = 0 /* optional header */
                 ) noexcept
    {
      typedef Kokkos::pair<int,int> type ;

      if ( ( max_bit_count < bit_bound ) ||
           ( state_header & ~state_header_mask ) ||
           ( bit_bound <= bit ) ) {
        return type(-3,-3);
      }

      // Number of bit words, counting a trailing partial word.
      // Rounding down here would leave the partial word unreachable
      // for the search below, which then spins on the full words
      // while the only free bits sit in the partial word.

      const uint32_t word_count =
        ( bit_bound + bits_per_int_mask ) >> bits_per_int_lg2 ;

      // Use potentially two fetch_add to avoid CAS loop.
      // Could generate "racing" failure-to-acquire
      // when is full at the atomic_fetch_add(+1)
      // then a release occurs before the atomic_fetch_add(-1).

      const uint32_t state = (uint32_t)
        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );

      const uint32_t state_error =
        state_header != ( state & state_header_mask );

      const uint32_t state_bit_used = state & state_used_mask ;

      if ( state_error || ( bit_bound <= state_bit_used ) ) {
        // Undo the speculative increment of the used count.
        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );
        return state_error ? type(-2,-2) : type(-1,-1);
      }

      // Do not update bit until count is visible:

      Kokkos::memory_fence();

      // There is a zero bit available somewhere,
      // now find the (first) available bit and set it.

      while(1) {

        const uint32_t word = bit >> bits_per_int_lg2 ;
        const uint32_t mask = 1u << ( bit & bits_per_int_mask );
        const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask);

        if ( ! ( prev & mask ) ) {
          // Successfully claimed 'result.first' by
          // atomically setting that bit.
          return type( bit , state_bit_used + 1 );
        }

        // Failed race to set the selected bit
        // Find a new bit to try.

        const int j = Kokkos::Impl::bit_first_zero( prev );

        if ( 0 <= j ) {
          bit = (word << bits_per_int_lg2 ) | uint32_t(j);
        }

        if ( ( j < 0 ) || ( bit_bound <= bit ) ) {
          // Nothing usable in this word: move to the next word
          // (wrapping to the first one), keeping the in-word offset.
          const uint32_t next_word = (word+1) < word_count ? (word+1) : 0 ;

          bit = ( next_word << bits_per_int_lg2 )
              | ( bit & bits_per_int_mask );

          // The kept offset can overshoot the bound when the next word
          // is the trailing partial word; restart at that word's first
          // bit, which is always within the bound.
          if ( bit_bound <= bit ) {
            bit = next_word << bits_per_int_lg2 ;
          }
        }
      }
    }

  /**\brief  Release a previously acquired bit.
   *
   *  Requires: 'bit' previously acquired and has not yet been released.
   *
   *  Returns:
   *    0 <= used count after successful release
   *    -1 bit was already released
   *    -2 state_header error
   */
  KOKKOS_INLINE_FUNCTION static
  int release( uint32_t volatile * const buffer
             , uint32_t const bit
             , uint32_t const state_header = 0 /* optional header */
             ) noexcept
    {
      if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }

      const uint32_t mask = 1u << ( bit & bits_per_int_mask );
      const uint32_t prev =
        Kokkos::atomic_fetch_and( buffer + ( bit >> bits_per_int_lg2 ) + 1
                                , ~mask
                                );

      if ( ! ( prev & mask ) ) { return -1 ; }

      // Do not update count until bit clear is visible
      Kokkos::memory_fence();

      const int count =
        Kokkos::atomic_fetch_add( (volatile int *) buffer , -1 );

      return ( count & state_used_mask ) - 1 ;
    }

  /**\brief  Set a specific bit.
   *
   *  Requires: Bit within bounds and not already set.
   *
   *  Returns:
   *    0 < used count after successful set
   *    -1 bit was already set
   *    -2 state_header error
   */
  KOKKOS_INLINE_FUNCTION static
  int set( uint32_t volatile * const buffer
         , uint32_t const bit
         , uint32_t const state_header = 0 /* optional header */
         ) noexcept
    {
      if ( state_header != ( state_header_mask & *buffer ) ) { return -2 ; }

      const uint32_t mask = 1u << ( bit & bits_per_int_mask );
      const uint32_t prev =
        Kokkos::atomic_fetch_or( buffer + ( bit >> bits_per_int_lg2 ) + 1
                               , mask
                               );

      // The bit was set before this call: nothing was claimed,
      // so the used count must stay untouched.
      if ( prev & mask ) { return -1 ; }

      // Do not update count until bit set is visible
      Kokkos::memory_fence();

      // One more bit is in use now: increment the used count.
      const int count =
        Kokkos::atomic_fetch_add( (volatile int *) buffer , 1 );

      return ( count & state_used_mask ) + 1 ;
    }
};
}} // namespace Kokkos::Impl
#endif /* #ifndef KOKKOS_CONCURRENTBITSET_HPP */

View File

@ -46,6 +46,7 @@
#include <cctype>
#include <cstring>
#include <iostream>
#include <sstream>
#include <cstdlib>
//----------------------------------------------------------------------------
@ -75,10 +76,10 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#endif
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_PTHREAD )
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
const int num_threads = args.num_threads;
const int use_numa = args.num_numa;
#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_PTHREAD )
#endif // defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS )
#if defined( KOKKOS_ENABLE_CUDA )
const int use_gpu = args.device_id;
#endif // defined( KOKKOS_ENABLE_CUDA )
@ -103,7 +104,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
#if defined( KOKKOS_ENABLE_THREADS )
if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
if(num_threads>0) {
@ -135,6 +136,25 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
if( Impl::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value ) {
if(num_threads>0) {
if(use_numa>0) {
Kokkos::Experimental::OpenMPTarget::initialize(num_threads,use_numa);
}
else {
Kokkos::Experimental::OpenMPTarget::initialize(num_threads);
}
} else {
Kokkos::Experimental::OpenMPTarget::initialize();
}
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
}
else {
//std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
}
#endif
#if defined( KOKKOS_ENABLE_CUDA )
if( std::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
if (use_gpu > -1) {
@ -166,6 +186,13 @@ void finalize_internal( const bool all_spaces = false )
}
#endif
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
if( std::is_same< Kokkos::Experimental::OpenMPTarget , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
if(Kokkos::Experimental::OpenMPTarget::is_initialized())
Kokkos::Experimental::OpenMPTarget::finalize();
}
#endif
#if defined( KOKKOS_ENABLE_OPENMP )
if( std::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
@ -175,7 +202,7 @@ void finalize_internal( const bool all_spaces = false )
}
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
#if defined( KOKKOS_ENABLE_THREADS )
if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
all_spaces ) {
@ -210,7 +237,7 @@ void fence_internal()
}
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
#if defined( KOKKOS_ENABLE_THREADS )
if( std::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
std::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
Kokkos::Threads::fence();
@ -226,6 +253,39 @@ void fence_internal()
}
/** \brief  Test whether a command line argument names the expected option.
 *
 *  Returns true when 'arg' is exactly 'expected', or starts with
 *  'expected' followed by a character that cannot continue an option
 *  name (anything but a letter, a digit, '-' or '_'), e.g. '='.
 *  Returns false otherwise.
 */
bool check_arg(char const* arg, char const* expected) {
  std::size_t const arg_len = std::strlen(arg);
  std::size_t const exp_len = std::strlen(expected);
  if (arg_len < exp_len) return false;
  if (std::strncmp(arg, expected, exp_len) != 0) return false;
  if (arg_len == exp_len) return true;
  /* if expected is "--threads", ignore "--threads-for-application"
     by checking this character ---------^
     to see if it continues to make a longer name */
  // std::isalnum requires a value representable as unsigned char;
  // a plain (possibly negative) char is not a valid argument.
  unsigned char const next = static_cast<unsigned char>(arg[exp_len]);
  if (std::isalnum(next) || next == '-' || next == '_') {
    return false;
  }
  return true;
}
/** \brief  Parse a command line argument of the form '<expected>=INT'.
 *
 *  Returns false, leaving '*value' untouched, when 'arg' does not name
 *  the 'expected' option (see check_arg).
 *  Otherwise the option must be followed by '=' and a non-empty
 *  unsigned integer: on success the number is stored in '*value' and
 *  true is returned, on a malformed argument the error is reported
 *  through Impl::throw_runtime_exception.
 */
bool check_int_arg(char const* arg, char const* expected, int* value) {
  if (!check_arg(arg, expected)) return false;
  std::size_t const exp_len = std::strlen(expected);
  // check_arg established that 'arg' starts with 'expected', so
  // arg[exp_len] is readable: at worst it is the terminator.
  // Only look past it when it is the '=' separator; otherwise
  // 'arg + exp_len + 1' may lie beyond the end of the string.
  char const* const number =
    ( arg[exp_len] == '=' ) ? arg + exp_len + 1 : nullptr;
  bool const okay = ( number != nullptr )
                 && ( strlen(number) != 0 )
                 && Impl::is_unsigned_int(number);
  if (okay) {
    // Convert only after the text has been validated.
    *value = std::atoi(number);
    return true;
  }
  std::ostringstream ss;
  ss << "Error: expecting an '=INT' after command line argument '" << expected << "'";
  ss << ". Raised by Kokkos::initialize(int narg, char* argc[]).";
  Impl::throw_runtime_exception( ss.str() );
  // Presumably not reached (the call above is expected to throw);
  // keeps the original fall-through result.
  return true;
}
} // namespace
} // namespace Impl
} // namespace Kokkos
@ -248,76 +308,31 @@ void initialize(int& narg, char* arg[])
int iarg = 0;
while (iarg < narg) {
if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
//Find the number of threads (expecting --threads=XX)
if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
num_threads = atoi(number);
//Remove the --kokkos-threads argument from the list but leave --threads
if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_threads_found=1;
narg--;
} else {
iarg++;
if (Impl::check_int_arg(arg[iarg], "--kokkos-threads", &num_threads)) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
} else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
//Find the number of numa (expecting --numa=XX)
if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
numa = atoi(number);
//Remove the --kokkos-numa argument from the list but leave --numa
if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_numa_found=1;
narg--;
} else {
iarg++;
kokkos_threads_found=1;
narg--;
} else if (!kokkos_threads_found && Impl::check_int_arg(arg[iarg], "--threads", &num_threads)) {
iarg++;
} else if (Impl::check_int_arg(arg[iarg], "--kokkos-numa", &numa)) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
} else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
//Find the number of device (expecting --device=XX)
if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
char* number = strchr(arg[iarg],'=')+1;
if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
device = atoi(number);
//Remove the --kokkos-device argument from the list but leave --device
if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
kokkos_device_found=1;
narg--;
} else {
iarg++;
kokkos_numa_found=1;
narg--;
} else if (!kokkos_numa_found && Impl::check_int_arg(arg[iarg], "--numa", &numa)) {
iarg++;
} else if (Impl::check_int_arg(arg[iarg], "--kokkos-device", &device)) {
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
} else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
kokkos_device_found=1;
narg--;
} else if (!kokkos_device_found && Impl::check_int_arg(arg[iarg], "--device", &device)) {
iarg++;
} else if (Impl::check_arg(arg[iarg], "--kokkos-ndevices") || Impl::check_arg(arg[iarg], "--ndevices")) {
//Find the number of device (expecting --device=XX)
if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
@ -511,20 +526,8 @@ void print_configuration( std::ostream & out , const bool detail )
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PTHREAD: ";
#ifdef KOKKOS_ENABLE_PTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_STDTHREAD: ";
#ifdef KOKKOS_ENABLE_STDTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINTHREAD: ";
#ifdef KOKKOS_ENABLE_WINTHREAD
msg << " KOKKOS_ENABLE_THREADS: ";
#ifdef KOKKOS_ENABLE_THREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
@ -754,7 +757,7 @@ void print_configuration( std::ostream & out , const bool detail )
#ifdef KOKKOS_ENABLE_OPENMP
OpenMP::print_configuration(msg, detail);
#endif
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( WINTHREAD )
#if defined( KOKKOS_ENABLE_THREADS )
Threads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_QTHREADS

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,14 +36,14 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <ostream>
#include <sstream>
@ -127,7 +127,8 @@ std::string human_memory_size(size_t arg_bytes)
#include <execinfo.h>
#include <cxxabi.h>
#include <dlfcn.h>
#include <stdlib.h>
#include <cstdlib>
namespace Kokkos {
namespace Impl {

View File

@ -75,7 +75,9 @@ void abort( const char * const message ) {
#ifdef __CUDA_ARCH__
Kokkos::Impl::cuda_abort(message);
#else
Kokkos::Impl::host_abort(message);
#ifndef KOKKOS_ENABLE_OPENMPTARGET
Kokkos::Impl::host_abort(message);
#endif
#endif
}

View File

@ -1,3 +1,46 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
namespace Kokkos {
namespace Impl {
@ -17,3 +60,4 @@ Impl::PerThreadValue PerThread(const int& arg)
}
}

View File

@ -55,6 +55,46 @@
namespace Kokkos {
namespace Impl {
// Compile-time probe for a member named `init` on a functor type.
// Primary template: selected whenever the specialization below is not
// viable, i.e. the expression `& Functor::init` cannot be formed.
template< class Functor , class SfinaeGuard = void >
struct ReduceFunctorHasInit
{
  enum { value = false };
};

// Specialization: participates only when `& Functor::init` is a valid
// expression (its size is then necessarily non-zero).
template< class Functor >
struct ReduceFunctorHasInit
  < Functor
  , typename Impl::enable_if< ( sizeof( & Functor::init ) > 0 ) >::type
  >
{
  enum { value = true };
};
// Compile-time probe for a member named `join` on a functor type.
// Primary template: selected whenever the specialization below is not
// viable, i.e. the expression `& Functor::join` cannot be formed.
template< class Functor , class SfinaeGuard = void >
struct ReduceFunctorHasJoin
{
  enum { value = false };
};

// Specialization: participates only when `& Functor::join` is a valid
// expression (its size is then necessarily non-zero).
template< class Functor >
struct ReduceFunctorHasJoin
  < Functor
  , typename Impl::enable_if< ( sizeof( & Functor::join ) > 0 ) >::type
  >
{
  enum { value = true };
};
// Compile-time probe for a member named `final` on a functor type.
// Primary template: selected whenever the specialization below is not
// viable, i.e. the expression `& Functor::final` cannot be formed.
template< class Functor , class SfinaeGuard = void >
struct ReduceFunctorHasFinal
{
  enum { value = false };
};

// Specialization: participates only when `& Functor::final` is a valid
// expression (its size is then necessarily non-zero).
template< class Functor >
struct ReduceFunctorHasFinal
  < Functor
  , typename Impl::enable_if< ( sizeof( & Functor::final ) > 0 ) >::type
  >
{
  enum { value = true };
};
// Compile-time probe for a member named `team_shmem_size` on a functor type.
// Primary template: selected whenever the specialization below is not
// viable, i.e. the expression `& Functor::team_shmem_size` cannot be formed.
template< class Functor , class SfinaeGuard = void >
struct ReduceFunctorHasShmemSize
{
  enum { value = false };
};

// Specialization: participates only when `& Functor::team_shmem_size` is a
// valid expression (its size is then necessarily non-zero).
template< class Functor >
struct ReduceFunctorHasShmemSize
  < Functor
  , typename Impl::enable_if< ( sizeof( & Functor::team_shmem_size ) > 0 ) >::type
  >
{
  enum { value = true };
};
template< class FunctorType , class ArgTag , class Enable = void >
struct FunctorDeclaresValueType : public Impl::false_type {};
@ -63,6 +103,21 @@ struct FunctorDeclaresValueType< FunctorType , ArgTag
, typename Impl::enable_if_type< typename FunctorType::value_type >::type >
: public Impl::true_type {};
// Classifies a functor as "non-trivial" for reduction purposes.
// The defaulted boolean parameter is the disjunction of five probes:
// the functor declares a value_type, or it has an init, join, final or
// team_shmem_size member.  The primary template handles the `false` case.
template< class Functor
        , bool AnyReduceFeature =
            ( FunctorDeclaresValueType< Functor , void >::value ) ||
            ( ReduceFunctorHasInit< Functor >::value ) ||
            ( ReduceFunctorHasJoin< Functor >::value ) ||
            ( ReduceFunctorHasFinal< Functor >::value ) ||
            ( ReduceFunctorHasShmemSize< Functor >::value )
        >
struct IsNonTrivialReduceFunctor
{
  enum { value = false };
};

// Selected when at least one of the probes above reported true.
template< class Functor >
struct IsNonTrivialReduceFunctor< Functor , true >
{
  enum { value = true };
};
/** \brief Query Functor and execution policy argument tag for value type.
*

View File

@ -48,7 +48,6 @@
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Reducer.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -101,6 +100,29 @@ private:
using Tag = typename has_work_tag<>::type ;
using WTag = typename has_work_tag<>::wtag ;
//----------------------------------------
// Check for T::execution_space
// Primary template: used when `Candidate` has no nested `execution_space`
// type, or when that nested type is `void`.  Reports `void` / false.
template< typename Candidate , typename = std::integral_constant< bool , false > >
struct has_execution_space
{
  using type = void ;
  enum { value = false };
};

// Specialization: the second argument names the result type of comparing
// `Candidate::execution_space` with `void`.  It only coincides with the
// primary template's default (the "false" constant) when the nested type
// exists and is not `void`; a missing nested type removes this
// specialization from consideration.
template< typename Candidate >
struct has_execution_space
  < Candidate
  , typename std::is_same< void , typename Candidate::execution_space >::type
  >
{
  using type = typename Candidate::execution_space ;
  enum { value = true };
};
// Probe results for the two class template arguments: does the execution
// policy, respectively the functor, declare a non-void execution_space?
using policy_has_space = has_execution_space< Policy > ;
using functor_has_space = has_execution_space< Functor > ;
// Compile-time consistency check: it can only fail when BOTH the policy and
// the functor declare an execution space and the two types differ.  If either
// one declares none, the first two terms short-circuit the check.
static_assert( ! policy_has_space::value ||
! functor_has_space::value ||
std::is_same< typename policy_has_space::type
, typename functor_has_space::type >::value
, "Execution Policy and Functor execution space must match" );
//----------------------------------------
// Check for Functor::value_type, which is either a simple type T or T[]
@ -190,6 +212,16 @@ private:
public:
// Execution space selected for this analysis, in order of precedence:
//   1. the functor's execution_space, if it declares one;
//   2. otherwise the policy's execution_space, if it declares one;
//   3. otherwise Kokkos::DefaultExecutionSpace.
using execution_space = typename std::conditional
< functor_has_space::value
, typename functor_has_space::type
, typename std::conditional
< policy_has_space::value
, typename policy_has_space::type
, Kokkos::DefaultExecutionSpace
>::type
>::type ;
using value_type = typename std::remove_extent< candidate_type >::type ;
static_assert( ! std::is_const< value_type >::value
@ -214,14 +246,14 @@ public:
private:
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
KOKKOS_INLINE_FUNCTION static constexpr
typename std::enable_if< IsArray , unsigned >::type
get_length( FF const & f ) { return f.value_count ; }
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
KOKKOS_INLINE_FUNCTION static constexpr
typename std::enable_if< ! IsArray , unsigned >::type
get_length( FF const & ) { return 1 ; }
get_length( FF const & ) { return candidate_is_void ? 0 : 1 ; }
public:
@ -229,25 +261,25 @@ public:
! candidate_is_array
? sizeof(ValueType) : 0 };
KOKKOS_FORCEINLINE_FUNCTION static
KOKKOS_FORCEINLINE_FUNCTION static constexpr
unsigned value_count( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f); }
KOKKOS_FORCEINLINE_FUNCTION static
KOKKOS_FORCEINLINE_FUNCTION static constexpr
unsigned value_size( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
//----------------------------------------
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
KOKKOS_FORCEINLINE_FUNCTION static constexpr
unsigned value_count( const Unknown & )
{ return 1 ; }
{ return candidate_is_void ? 0 : 1 ; }
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
KOKKOS_FORCEINLINE_FUNCTION static constexpr
unsigned value_size( const Unknown & )
{ return sizeof(ValueType); }
{ return candidate_is_void ? 0 : sizeof(ValueType); }
private:
@ -277,16 +309,16 @@ private:
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
void enable_if( void (F::*)( vref_type , cvref_type ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
void join( F const * const f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( *dst , *src ); }
{ f->join( *dst , *src ); }
};
template< class F >
@ -296,16 +328,16 @@ private:
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
void enable_if( void (F::*)( vref_type , cvref_type ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
void join( F const * const f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( dst , src ); }
{ f->join( dst , src ); }
};
template< class F >
@ -315,22 +347,22 @@ private:
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
void join( F const * const f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , *dst , *src ); }
{ f->join( WTag() , *dst , *src ); }
};
template< class F >
@ -340,22 +372,22 @@ private:
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
void join( F const * const f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , dst , src ); }
{ f->join( WTag() , dst , src ); }
};
@ -364,12 +396,14 @@ private:
, typename = void >
struct DeduceJoin
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static
void join( F const & f
void join( F const * const f
, ValueType volatile * dst
, ValueType volatile const * src )
{
const int n = FunctorAnalysis::value_count( f );
const int n = FunctorAnalysis::value_count( *f );
for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
}
};
@ -377,8 +411,10 @@ private:
template< class F >
struct DeduceJoin< F , DISABLE , void >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static
void join( F const &
void join( F const * const
, ValueType volatile *
, ValueType volatile const * ) {}
};
@ -386,7 +422,8 @@ private:
template< class F , INTERFACE I >
struct DeduceJoin< F , I ,
decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
: public has_join_function<F,I> {};
: public has_join_function<F,I>
{ enum { value = true }; };
//----------------------------------------
@ -397,38 +434,38 @@ private:
struct has_init_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
void enable_if( void (F::*)( ValueType & ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( *dst ); }
void init( F const * const f , ValueType * dst )
{ f->init( *dst ); }
};
template< class F >
struct has_init_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
void enable_if( void (F::*)( ValueType * ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( dst ); }
void init( F const * const f , ValueType * dst )
{ f->init( dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
void enable_if( void (F::*)( WTag , ValueType & ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
void enable_if( void (F::*)( WTag const & , ValueType & ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
@ -437,18 +474,18 @@ private:
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), *dst ); }
void init( F const * const f , ValueType * dst )
{ f->init( WTag(), *dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
void enable_if( void (F::*)( WTag , ValueType * ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
void enable_if( void (F::*)( WTag const & , ValueType * ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
@ -457,8 +494,8 @@ private:
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), dst ); }
void init( F const * const f , ValueType * dst )
{ f->init( WTag(), dst ); }
};
template< class F = Functor
@ -466,87 +503,29 @@ private:
, typename = void >
struct DeduceInit
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
void init( F const * const , ValueType * dst ) { new(dst) ValueType(); }
};
template< class F >
struct DeduceInit< F , DISABLE , void >
{
enum { value = false };
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * ) {}
void init( F const * const , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceInit< F , I ,
decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
: public has_init_function<F,I> {};
: public has_init_function<F,I>
{ enum { value = true }; };
//----------------------------------------
public:
struct Reducer
{
private:
Functor const & m_functor ;
ValueType * const m_result ;
int const m_length ;
public:
using reducer = Reducer ;
using value_type = FunctorAnalysis::value_type ;
using memory_space = void ;
using reference_type = FunctorAnalysis::reference_type ;
KOKKOS_INLINE_FUNCTION
void join( ValueType volatile * dst
, ValueType volatile const * src ) const noexcept
{ DeduceJoin<>::join( m_functor , dst , src ); }
KOKKOS_INLINE_FUNCTION
void init( ValueType * dst ) const noexcept
{ DeduceInit<>::init( m_functor , dst ); }
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( Functor const & arg_functor
, ValueType * arg_value = 0
, int arg_length = 0 ) noexcept
: m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept { return m_length ; }
KOKKOS_INLINE_FUNCTION
ValueType & operator[]( int i ) const noexcept
{ return m_result[i]; }
private:
template< bool IsArray >
constexpr
typename std::enable_if< IsArray , ValueType * >::type
ref() const noexcept { return m_result ; }
template< bool IsArray >
constexpr
typename std::enable_if< ! IsArray , ValueType & >::type
ref() const noexcept { return *m_result ; }
public:
KOKKOS_INLINE_FUNCTION
auto result() const noexcept
-> decltype( Reducer::template ref< candidate_is_array >() )
{ return Reducer::template ref< candidate_is_array >(); }
};
//----------------------------------------
private:
template< class , INTERFACE >
struct has_final_function ;
@ -555,14 +534,14 @@ private:
struct has_final_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
void enable_if( void (F::*)( ValueType & ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( *dst ); }
void final( F const * const f , ValueType * dst )
{ f->final( *dst ); }
};
// No tag, is array
@ -570,14 +549,14 @@ private:
struct has_final_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
void enable_if( void (F::*)( ValueType * ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( dst ); }
void final( F const * const f , ValueType * dst )
{ f->final( dst ); }
};
// Has tag, not array
@ -585,10 +564,10 @@ private:
struct has_final_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
void enable_if( void (F::*)( WTag , ValueType & ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
void enable_if( void (F::*)( WTag const & , ValueType & ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
@ -597,8 +576,8 @@ private:
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), *dst ); }
void final( F const * const f , ValueType * dst )
{ f->final( WTag(), *dst ); }
};
// Has tag, is array
@ -606,10 +585,10 @@ private:
struct has_final_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
void enable_if( void (F::*)( WTag , ValueType * ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
void enable_if( void (F::*)( WTag const & , ValueType * ) const );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
@ -618,8 +597,8 @@ private:
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), dst ); }
void final( F const * const f , ValueType * dst )
{ f->final( WTag(), dst ); }
};
template< class F = Functor
@ -627,20 +606,139 @@ private:
, typename = void >
struct DeduceFinal
{
enum { value = false };
KOKKOS_INLINE_FUNCTION
static void final( F const & , ValueType * ) {}
static void final( F const * const , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceFinal< F , I ,
decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
: public has_init_function<F,I> {};
: public has_final_function<F,I>
{ enum { value = true }; };
//----------------------------------------
// Resolves a functor's team shared-memory size query.
//
// Primary template: chosen when neither specialization below is viable,
// i.e. F exposes neither an addressable `team_shmem_size` nor an
// addressable `shmem_size` member.  Always reports zero bytes.
template< class F = Functor , typename = void >
struct DeduceTeamShmem
{
  enum { value = false };

  // By-reference form, retained so existing callers keep compiling.
  static size_t team_shmem_size( F const & , int ) { return 0 ; }

  // By-pointer form, matching the signature used by both specializations so
  // that generic code can issue the same call for every functor type.
  static size_t team_shmem_size( F const * const , int ) { return 0 ; }
};

// F has a member named `team_shmem_size`: forward the team size to it.
// NOTE(review): this and the following specialization have the same form;
// a functor declaring BOTH team_shmem_size and shmem_size would presumably
// make the selection ambiguous - confirm whether such functors exist.
template< class F >
struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::team_shmem_size ) >::type >
{
  enum { value = true };

  static size_t team_shmem_size( F const * const f , int team_size )
    { return f->team_shmem_size( team_size ); }
};

// F has a member named `shmem_size`: forward the team size to it.
template< class F >
struct DeduceTeamShmem< F , typename std::enable_if< 0 < sizeof( & F::shmem_size ) >::type >
{
  enum { value = true };

  static size_t team_shmem_size( F const * const f , int team_size )
    { return f->shmem_size( team_size ); }
};
//----------------------------------------
public:
// Applies the functor's `final` step (if it has one) to `result`.
// DeduceFinal<>::final receives the functor by pointer, so the address of
// the by-reference argument is forwarded.
static void final( Functor const & f , ValueType * result )
  { DeduceFinal<>::final( & f , result ); }
// Forwards the team shared-memory size query to DeduceTeamShmem.
// NOTE(review): every DeduceTeamShmem<>::team_shmem_size visible in this file
// takes two arguments (the functor and an int team size), and the
// specializations take the functor by pointer; this one-argument,
// by-reference call does not appear to match any of them - confirm and
// decide how the team size should be supplied before instantiating this.
inline static
size_t team_shmem_size( Functor const & f )
{ return DeduceTeamShmem<>::team_shmem_size( f ); }
//----------------------------------------
// Public compile-time flags mirroring the `value` constants of the
// DeduceJoin / DeduceInit / DeduceFinal selections for the default arguments.
enum { has_join_member_function = DeduceJoin<>::value };
enum { has_init_member_function = DeduceInit<>::value };
enum { has_final_member_function = DeduceFinal<>::value };
// Adapts the analyzed functor to a reducer-style interface
// (data / reference / length / copy / join / init / final).
// Holds two non-owning pointers: the functor and the result storage.
// Copy/move construction is allowed; assignment is deleted because both
// members are const pointers.
template< class MemorySpace = typename execution_space::memory_space >
struct Reducer
{
private:

  Functor const * const m_functor ;
  ValueType     * const m_result ;

  // Result access for array-valued reductions: the raw pointer.
  template< bool IsArray >
  KOKKOS_INLINE_FUNCTION constexpr
  typename std::enable_if< IsArray , FunctorAnalysis::ValueType * >::type
  ref() const noexcept { return m_result ; }

  // Result access for scalar reductions: a reference to the single value.
  template< bool IsArray >
  KOKKOS_INLINE_FUNCTION constexpr
  typename std::enable_if< ! IsArray , FunctorAnalysis::ValueType & >::type
  ref() const noexcept { return *m_result ; }

  // Element count for array-valued reductions: read from the functor.
  template< bool IsArray >
  KOKKOS_INLINE_FUNCTION constexpr
  typename std::enable_if< IsArray , int >::type
  len() const noexcept { return m_functor->value_count ; }

  // Element count otherwise: 0 when there is no value type, else 1.
  template< bool IsArray >
  KOKKOS_INLINE_FUNCTION constexpr
  typename std::enable_if< ! IsArray , int >::type
  len() const noexcept { return candidate_is_void ? 0 : 1 ; }

public:

  using reducer        = Reducer ;
  using value_type     = FunctorAnalysis::value_type ;
  using memory_space   = MemorySpace ;
  using reference_type = FunctorAnalysis::reference_type ;
  using functor_type   = Functor ; // Adapts a functor

  // Pointer to the result storage supplied at construction.
  KOKKOS_INLINE_FUNCTION constexpr
  value_type * data() const noexcept { return m_result ; }

  // Result as a pointer (array case) or as a reference (scalar case).
  KOKKOS_INLINE_FUNCTION constexpr
  reference_type reference() const noexcept
    { return Reducer::template ref< candidate_is_array >(); }

  // Number of values participating in the reduction.
  KOKKOS_INLINE_FUNCTION constexpr
  int length() const noexcept
    { return Reducer::template len< candidate_is_array >(); }

  // Element-wise copy of `length()` values from `src` to `dst`.
  // The length is evaluated once, before the loop.
  KOKKOS_INLINE_FUNCTION
  void copy( ValueType * const dst
           , ValueType const * const src ) const noexcept
    {
      const int count = Reducer::template len< candidate_is_array >();
      for ( int i = 0 ; i < count ; ++i ) dst[i] = src[i] ;
    }

  // The three operations below delegate to the Deduce* selections, passing
  // the stored functor pointer.
  KOKKOS_INLINE_FUNCTION
  void join( ValueType volatile * dst
           , ValueType volatile const * src ) const noexcept
    { DeduceJoin<>::join( m_functor , dst , src ); }

  KOKKOS_INLINE_FUNCTION
  void init( ValueType * dst ) const noexcept
    { DeduceInit<>::init( m_functor , dst ); }

  KOKKOS_INLINE_FUNCTION
  void final( ValueType * dst ) const noexcept
    { DeduceFinal<>::final( m_functor , dst ); }

  Reducer( Reducer const & ) = default ;
  Reducer( Reducer && ) = default ;
  Reducer & operator = ( Reducer const & ) = delete ;
  Reducer & operator = ( Reducer && ) = delete ;

  // Same adapter bound to a different memory space tag.
  template< class S >
  using rebind = Reducer< S > ;

  // Both pointers default to null; callers are expected to supply a functor
  // before invoking join/init/final, and result storage before
  // data()/reference().
  KOKKOS_INLINE_FUNCTION explicit constexpr
  Reducer( Functor const * arg_functor = nullptr
         , ValueType * arg_value = nullptr ) noexcept
    : m_functor(arg_functor), m_result(arg_value) {}
};
};
} // namespace Impl

View File

@ -44,11 +44,10 @@
#include <Kokkos_Macros.hpp>
#include <stddef.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include <cstddef>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <sstream>
@ -397,3 +396,4 @@ void unlock_address_hbw_space(void* ptr) {
}
}
#endif

View File

@ -46,6 +46,7 @@
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
#if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_ENABLE_CUDA )
@ -85,10 +86,10 @@
/*--------------------------------------------------------------------------*/
#include <stddef.h>
#include <stdlib.h>
#include <stdint.h>
#include <memory.h>
#include <cstddef>
#include <cstdlib>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <sstream>
@ -98,6 +99,10 @@
#include <impl/Kokkos_Error.hpp>
#include <Kokkos_Atomic.hpp>
#if ( defined( KOKKOS_ENABLE_ASM ) || defined ( KOKKOS_ENABLE_TM ) ) && defined ( KOKKOS_ENABLE_ISA_X86_64 )
#include <immintrin.h>
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -490,16 +495,50 @@ void init_lock_array_host_space() {
}
// Tries to take the host-space lock slot associated with `ptr`.
// Returns true when the slot was acquired by this call, false otherwise.
// The slot index is derived from the address bits of `ptr`, masked and
// xor-ed with the HOST_SPACE_ATOMIC_* constants.
bool lock_address_host_space(void* ptr) {
  const auto slot =
    (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK ;

#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
  // Transactional attempt: inspect and set the slot inside one transaction.
  if ( _xbegin() == _XBEGIN_STARTED ) {
    if ( HOST_SPACE_ATOMIC_LOCKS[ slot ] != 0 ) {
      // Slot already taken: abort, which resumes after _xbegin() with a
      // non-started status and therefore falls through to the atomic path.
      _xabort( 1 );
    } else {
      HOST_SPACE_ATOMIC_LOCKS[ slot ] = 1;
    }

    _xend();

    return true;
  }
#endif

  // Non-transactional path (or transaction did not start / was aborted):
  // the slot is acquired iff the compare-exchange observed the value 0.
  return 0 == atomic_compare_exchange( & HOST_SPACE_ATOMIC_LOCKS[ slot ] , 0 , 1 );
}
// Releases the host-space lock slot associated with `ptr` by storing 0.
// The slot index is derived from the address bits of `ptr`, masked and
// xor-ed with the HOST_SPACE_ATOMIC_* constants (same mapping as the lock).
void unlock_address_host_space(void* ptr) {
  const auto slot =
    (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK ;

#if defined( KOKKOS_ENABLE_ISA_X86_64 ) && defined ( KOKKOS_ENABLE_TM )
  // Transactional release.  The transaction must be committed with _xend();
  // otherwise it stays open after this function returns and the store is
  // only made visible if some later code happens to commit it.
  if ( _XBEGIN_STARTED == _xbegin() ) {
    HOST_SPACE_ATOMIC_LOCKS[ slot ] = 0;
    _xend();
    return;
  }
#endif

  // Non-transactional path (or transaction did not start / was aborted).
  atomic_exchange( & HOST_SPACE_ATOMIC_LOCKS[ slot ] , 0 );
}
}
}

View File

@ -49,7 +49,6 @@
#include <Kokkos_Atomic.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Reducer.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
//----------------------------------------------------------------------------
@ -507,8 +506,9 @@ public:
const scratch_memory_space & thread_scratch(int) const
{ return m_scratch.set_team_thread_mode(0,m_data.m_team_size,m_data.m_team_rank); }
//----------------------------------------
//--------------------------------------------------------------------------
// Team collectives
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION void team_barrier() const noexcept
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
@ -519,22 +519,6 @@ public:
{}
#endif
template< class Closure >
KOKKOS_INLINE_FUNCTION
void team_barrier( Closure const & f ) const noexcept
{
if ( m_data.team_rendezvous() ) {
// All threads have entered 'team_rendezvous'
// only this thread returned from 'team_rendezvous'
// with a return value of 'true'
f();
m_data.team_rendezvous_release();
}
}
//--------------------------------------------------------------------------
template< typename T >
@ -613,8 +597,9 @@ public:
if ( 0 != m_data.m_team_rank ) {
// Non-root copies to their local buffer:
reducer.copy( (value_type*) m_data.team_reduce_local()
, reducer.data() );
/*reducer.copy( (value_type*) m_data.team_reduce_local()
, reducer.data() );*/
*((value_type*) m_data.team_reduce_local()) = reducer.reference();
}
// Root does not overwrite shared memory until all threads arrive
@ -630,19 +615,19 @@ public:
value_type * const src =
(value_type*) m_data.team_member(i)->team_reduce_local();
reducer.join( reducer.data() , src );
reducer.join( reducer.reference(), *src);
}
// Copy result to root member's buffer:
reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
// reducer.copy( (value_type*) m_data.team_reduce() , reducer.data() );
*((value_type*) m_data.team_reduce()) = reducer.reference();
m_data.team_rendezvous_release();
// This thread released all other threads from 'team_rendezvous'
// with a return value of 'false'
}
else {
// Copy from root member's buffer:
reducer.copy( reducer.data() , (value_type*) m_data.team_reduce() );
reducer.reference() = *((value_type*) m_data.team_reduce());
}
}
}
@ -652,7 +637,7 @@ public:
//--------------------------------------------------------------------------
template< typename ValueType , class JoinOp >
/*template< typename ValueType , class JoinOp >
KOKKOS_INLINE_FUNCTION
ValueType
team_reduce( ValueType const & value
@ -696,7 +681,7 @@ public:
}
#else
{ Kokkos::abort("HostThreadTeamMember team_reduce\n"); return ValueType(); }
#endif
#endif*/
template< typename T >
@ -854,7 +839,7 @@ parallel_reduce
, Reducer const & reducer
)
{
reducer.init( reducer.data() );
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start
; i < loop_boundaries.end
@ -875,9 +860,9 @@ parallel_reduce
, ValueType & result
)
{
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > > reducer( & result );
Kokkos::Experimental::Sum<ValueType> reducer( result );
reducer.init( reducer.data() );
reducer.init( result );
for( iType i = loop_boundaries.start
; i < loop_boundaries.end
@ -888,7 +873,7 @@ parallel_reduce
loop_boundaries.thread.team_reduce( reducer );
}
template< typename iType, class Space
/*template< typename iType, class Space
, class Closure, class Joiner , typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
@ -910,7 +895,7 @@ void parallel_reduce
}
loop_boundaries.thread.team_reduce( reducer );
}
}*/
//----------------------------------------------------------------------------
/** \brief Inter-thread vector parallel_reduce.
@ -923,15 +908,13 @@ void parallel_reduce
*/
template< typename iType, class Space , class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
typename std::enable_if< ! Kokkos::is_reducer<ValueType>::value >::type
parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
const Lambda & lambda,
ValueType& result)
{
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start ;
i < loop_boundaries.end ;
i += loop_boundaries.increment) {
@ -939,6 +922,22 @@ void parallel_reduce
}
}
/** \brief Vector-range parallel_reduce taking a reducer object.
 *
 * This overload participates only when Kokkos::is_reducer<ReducerType>::value
 * is true. It initializes the reducer's value through
 * reducer.init( reducer.reference() ) and then calls
 * lambda( i , reducer.reference() ) for i = start; i < end; i += increment.
 * The loop is a plain serial loop; no join step is performed in this function.
 */
template< typename iType, class Space , class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::HostThreadTeamMember<Space> >& loop_boundaries,
const Lambda & lambda,
const ReducerType& reducer)
{
// Reset the accumulator before any contribution is added.
reducer.init(reducer.reference());
for( iType i = loop_boundaries.start ;
i < loop_boundaries.end ;
i += loop_boundaries.increment) {
// The lambda accumulates directly into the reducer's value.
lambda(i,reducer.reference());
}
}
/** \brief Intra-thread vector parallel_reduce.
*
* Executes lambda(iType i, ValueType & val) for each i=[0..N)
@ -961,9 +960,6 @@ void parallel_reduce
const JoinType & join,
ValueType& result)
{
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start ;
i < loop_boundaries.end ;
i += loop_boundaries.increment ) {
@ -1055,9 +1051,8 @@ template< class Space , class FunctorType >
KOKKOS_INLINE_FUNCTION
void single( const Impl::ThreadSingleStruct< Impl::HostThreadTeamMember<Space> > & single , const FunctorType & functor )
{
if ( single.team_member.team_rank() == 0 ) functor();
// 'single' does not perform a barrier.
// single.team_member.team_barrier( functor );
if ( single.team_member.team_rank() == 0 ) functor();
}
template< class Space , class FunctorType , typename ValueType >

View File

@ -41,6 +41,7 @@
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
#define KOKKOS_MEMORY_FENCE_HPP
namespace Kokkos {
@ -108,4 +109,3 @@ void load_fence()
#endif

View File

@ -212,6 +212,12 @@
#endif
#endif
// Forward KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION to
// KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION (same value) unless the latter has
// already been defined.
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifndef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
#define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#endif
#endif
#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
#ifndef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT KOKKOS_HAVE_PRAGMA_LOOPCOUNT
@ -423,6 +429,12 @@
#endif
#endif
// Either threading macro (KOKKOS_ENABLE_PTHREAD or KOKKOS_ENABLE_WINTHREAD)
// implies KOKKOS_ENABLE_THREADS; define it here if it is not defined yet.
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
#ifndef KOKKOS_ENABLE_THREADS
#define KOKKOS_ENABLE_THREADS
#endif
#endif
//------------------------------------------------------------------------------
// Deprecated macros
//------------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -44,13 +44,11 @@
#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
#define KOKKOS_PHYSICAL_LAYOUT_HPP
#include <Kokkos_View.hpp>
namespace Kokkos {
namespace Impl {
struct PhysicalLayout {
enum LayoutType {Left,Right,Scalar,Error};
LayoutType layout_type;
@ -71,3 +69,4 @@ struct PhysicalLayout {
}
}
#endif

View File

@ -44,12 +44,14 @@
#ifndef KOKKOSP_DEVICE_INFO_HPP
#define KOKKOSP_DEVICE_INFO_HPP
#include <cstdint>
namespace Kokkos {
namespace Profiling {
struct KokkosPDeviceInfo {
uint32_t deviceID;
};
// Device descriptor used by the Kokkos::Profiling (KokkosP) interface.
struct KokkosPDeviceInfo {
// Numeric device identifier.
// NOTE(review): the numbering scheme is not defined in this header - confirm with the tool interface.
uint32_t deviceID;
};
}
}

View File

@ -41,197 +41,203 @@
//@HEADER
*/
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <string.h>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <cstring>
namespace Kokkos {
namespace Profiling {
namespace Profiling {
SpaceHandle::SpaceHandle(const char* space_name) {
strncpy(name,space_name,64);
}
// Copies space_name into the fixed-size 'name' buffer of the handle.
//
// strncpy() writes no terminator when the source is at least as long as the
// count, so copying the full buffer size could leave 'name' unterminated.
// Copy at most sizeof(name)-1 characters and terminate explicitly; longer
// names are truncated. Shorter names are still zero-padded by strncpy().
SpaceHandle::SpaceHandle(const char* space_name) {
  strncpy(name, space_name, sizeof(name) - 1);
  name[sizeof(name) - 1] = '\0';
}
bool profileLibraryLoaded() {
return (NULL != initProfileLibrary);
}
// Reports whether a profiling tool is active, judged by whether its
// init hook (initProfileLibrary) is currently bound.
bool profileLibraryLoaded() {
  const bool init_hook_bound = (initProfileLibrary != nullptr);
  return init_hook_bound;
}
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginForCallee) {
Kokkos::fence();
(*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelFor(const uint64_t kernelID) {
if(NULL != endForCallee) {
Kokkos::fence();
(*endForCallee)(kernelID);
}
}
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginScanCallee) {
Kokkos::fence();
(*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelScan(const uint64_t kernelID) {
if(NULL != endScanCallee) {
Kokkos::fence();
(*endScanCallee)(kernelID);
}
}
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
if(NULL != beginReduceCallee) {
Kokkos::fence();
(*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
}
}
void endParallelReduce(const uint64_t kernelID) {
if(NULL != endReduceCallee) {
Kokkos::fence();
(*endReduceCallee)(kernelID);
}
}
void pushRegion(const std::string& kName) {
if( NULL != pushRegionCallee ) {
Kokkos::fence();
(*pushRegionCallee)(kName.c_str());
}
}
void popRegion() {
if( NULL != popRegionCallee ) {
Kokkos::fence();
(*popRegionCallee)();
}
}
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
if(NULL != allocateDataCallee) {
(*allocateDataCallee)(space,label.c_str(),ptr,size);
}
}
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
if(NULL != allocateDataCallee) {
(*deallocateDataCallee)(space,label.c_str(),ptr,size);
}
}
void initialize() {
// Make sure initialize calls happens only once
static int is_initialized = 0;
if(is_initialized) return;
is_initialized = 1;
void* firstProfileLibrary;
char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
// If we do not find a profiling library in the environment then exit
// early.
if( NULL == envProfileLibrary ) {
return ;
}
char* envProfileCopy = (char*) malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
sprintf(envProfileCopy, "%s", envProfileLibrary);
char* profileLibraryName = strtok(envProfileCopy, ";");
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
if(NULL == firstProfileLibrary) {
std::cerr << "Error: Unable to load KokkosP library: " <<
profileLibraryName << std::endl;
} else {
std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
// dlsym returns a pointer to an object, while we want to assign to pointer to function
// A direct cast will give warnings hence, we have to workaround the issue by casting pointer to pointers.
auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
beginForCallee = *((beginFunction*) &p1);
auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
beginScanCallee = *((beginFunction*) &p2);
auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
beginReduceCallee = *((beginFunction*) &p3);
auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
endScanCallee = *((endFunction*) &p4);
auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
endForCallee = *((endFunction*) &p5);
auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
endReduceCallee = *((endFunction*) &p6);
auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
initProfileLibrary = *((initFunction*) &p7);
auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
finalizeProfileLibrary = *((finalizeFunction*) &p8);
auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
pushRegionCallee = *((pushFunction*) &p9);
auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
popRegionCallee = *((popFunction*) &p10);
auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
allocateDataCallee = *((allocateDataFunction*) &p11);
auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
deallocateDataCallee = *((deallocateDataFunction*) &p12);
}
}
if(NULL != initProfileLibrary) {
(*initProfileLibrary)(0,
(uint64_t) KOKKOSP_INTERFACE_VERSION,
(uint32_t) 0,
NULL);
}
free(envProfileCopy);
}
void finalize() {
// Make sure finalize calls happens only once
static int is_finalized = 0;
if(is_finalized) return;
is_finalized = 1;
if(NULL != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
// Set all profile hooks to NULL to prevent
// any additional calls. Once we are told to
// finalize, we mean it
initProfileLibrary = NULL;
finalizeProfileLibrary = NULL;
beginForCallee = NULL;
beginScanCallee = NULL;
beginReduceCallee = NULL;
endScanCallee = NULL;
endForCallee = NULL;
endReduceCallee = NULL;
pushRegionCallee = NULL;
popRegionCallee = NULL;
allocateDataCallee = NULL;
deallocateDataCallee = NULL;
}
}
// Forwards a parallel_for "begin" event to the profiling tool.
// Does nothing when no begin-for hook is bound. Otherwise calls
// Kokkos::fence() and then hands the kernel name, device id and the
// kernelID pointer to the hook.
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
  if(beginForCallee == nullptr) return;

  Kokkos::fence();
  beginForCallee(kernelPrefix.c_str(), devID, kernelID);
}
// Forwards a parallel_for "end" event to the profiling tool.
// Does nothing when no end-for hook is bound. Otherwise calls
// Kokkos::fence() and then passes kernelID to the hook.
void endParallelFor(const uint64_t kernelID) {
  if(endForCallee == nullptr) return;

  Kokkos::fence();
  endForCallee(kernelID);
}
// Forwards a parallel_scan "begin" event to the profiling tool.
// Does nothing when no begin-scan hook is bound. Otherwise calls
// Kokkos::fence() and then hands the kernel name, device id and the
// kernelID pointer to the hook.
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
  if(beginScanCallee == nullptr) return;

  Kokkos::fence();
  beginScanCallee(kernelPrefix.c_str(), devID, kernelID);
}
// Forwards a parallel_scan "end" event to the profiling tool.
// Does nothing when no end-scan hook is bound. Otherwise calls
// Kokkos::fence() and then passes kernelID to the hook.
void endParallelScan(const uint64_t kernelID) {
  if(endScanCallee == nullptr) return;

  Kokkos::fence();
  endScanCallee(kernelID);
}
// Forwards a parallel_reduce "begin" event to the profiling tool.
// Does nothing when no begin-reduce hook is bound. Otherwise calls
// Kokkos::fence() and then hands the kernel name, device id and the
// kernelID pointer to the hook.
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
  if(beginReduceCallee == nullptr) return;

  Kokkos::fence();
  beginReduceCallee(kernelPrefix.c_str(), devID, kernelID);
}
// Forwards a parallel_reduce "end" event to the profiling tool.
// Does nothing when no end-reduce hook is bound. Otherwise calls
// Kokkos::fence() and then passes kernelID to the hook.
void endParallelReduce(const uint64_t kernelID) {
  if(endReduceCallee == nullptr) return;

  Kokkos::fence();
  endReduceCallee(kernelID);
}
// Forwards a "push region" event, with the region name, to the profiling tool.
// Does nothing when no push-region hook is bound. Otherwise calls
// Kokkos::fence() before invoking the hook.
void pushRegion(const std::string& kName) {
  if(pushRegionCallee == nullptr) return;

  Kokkos::fence();
  pushRegionCallee(kName.c_str());
}
// Forwards a "pop region" event to the profiling tool.
// Does nothing when no pop-region hook is bound. Otherwise calls
// Kokkos::fence() before invoking the hook.
void popRegion() {
  if(popRegionCallee == nullptr) return;

  Kokkos::fence();
  popRegionCallee();
}
// Reports an allocation (memory space, label, pointer, size) to the
// profiling tool. Does nothing when no allocate hook is bound.
// Unlike the kernel and region events, no fence is issued here.
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
  if(allocateDataCallee == nullptr) return;

  allocateDataCallee(space, label.c_str(), ptr, size);
}
// Reports a deallocation (memory space, label, pointer, size) to the
// profiling tool. Does nothing when no deallocate hook is bound.
//
// The guard must test deallocateDataCallee, the pointer that is actually
// called. Testing allocateDataCallee instead would call through a NULL
// pointer whenever a tool exports kokkosp_allocate_data but not
// kokkosp_deallocate_data, and would skip the hook in the opposite case.
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size) {
  if(NULL != deallocateDataCallee) {
    (*deallocateDataCallee)(space,label.c_str(),ptr,size);
  }
}
void initialize() {
// Make sure initialize calls happens only once
static int is_initialized = 0;
if(is_initialized) return;
is_initialized = 1;
void* firstProfileLibrary;
char* envProfileLibrary = getenv("KOKKOS_PROFILE_LIBRARY");
// If we do not find a profiling library in the environment then exit
// early.
if( NULL == envProfileLibrary ) {
return ;
}
char* envProfileCopy = (char*) malloc(sizeof(char) * (strlen(envProfileLibrary) + 1));
sprintf(envProfileCopy, "%s", envProfileLibrary);
char* profileLibraryName = strtok(envProfileCopy, ";");
if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
if(NULL == firstProfileLibrary) {
std::cerr << "Error: Unable to load KokkosP library: " <<
profileLibraryName << std::endl;
} else {
#ifdef KOKKOS_ENABLE_PROFILING_LOAD_PRINT
std::cout << "KokkosP: Library Loaded: " << profileLibraryName << std::endl;
#endif
// dlsym returns a pointer to an object, while we want to assign to pointer to function
// A direct cast will give warnings hence, we have to workaround the issue by casting pointer to pointers.
auto p1 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
beginForCallee = *((beginFunction*) &p1);
auto p2 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
beginScanCallee = *((beginFunction*) &p2);
auto p3 = dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
beginReduceCallee = *((beginFunction*) &p3);
auto p4 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
endScanCallee = *((endFunction*) &p4);
auto p5 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
endForCallee = *((endFunction*) &p5);
auto p6 = dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
endReduceCallee = *((endFunction*) &p6);
auto p7 = dlsym(firstProfileLibrary, "kokkosp_init_library");
initProfileLibrary = *((initFunction*) &p7);
auto p8 = dlsym(firstProfileLibrary, "kokkosp_finalize_library");
finalizeProfileLibrary = *((finalizeFunction*) &p8);
auto p9 = dlsym(firstProfileLibrary, "kokkosp_push_profile_region");
pushRegionCallee = *((pushFunction*) &p9);
auto p10 = dlsym(firstProfileLibrary, "kokkosp_pop_profile_region");
popRegionCallee = *((popFunction*) &p10);
auto p11 = dlsym(firstProfileLibrary, "kokkosp_allocate_data");
allocateDataCallee = *((allocateDataFunction*) &p11);
auto p12 = dlsym(firstProfileLibrary, "kokkosp_deallocate_data");
deallocateDataCallee = *((deallocateDataFunction*) &p12);
}
}
if(NULL != initProfileLibrary) {
(*initProfileLibrary)(0,
(uint64_t) KOKKOSP_INTERFACE_VERSION,
(uint32_t) 0,
NULL);
}
free(envProfileCopy);
}
// Shuts the profiling tool down and unbinds every hook.
//
// The body runs at most once; later calls return immediately. When no
// finalize hook is bound the function returns without touching the other
// hook pointers. Otherwise the tool's finalize hook is called and all hook
// pointers are reset so that no further events reach the tool.
void finalize() {
  static int is_finalized = 0;
  if(is_finalized) return;
  is_finalized = 1;

  if(finalizeProfileLibrary == nullptr) return;

  finalizeProfileLibrary();

  // Library lifecycle hooks.
  initProfileLibrary = nullptr;
  finalizeProfileLibrary = nullptr;

  // Kernel begin/end hooks.
  beginForCallee = nullptr;
  beginScanCallee = nullptr;
  beginReduceCallee = nullptr;
  endScanCallee = nullptr;
  endForCallee = nullptr;
  endReduceCallee = nullptr;

  // Region hooks.
  pushRegionCallee = nullptr;
  popRegionCallee = nullptr;

  // Memory event hooks.
  allocateDataCallee = nullptr;
  deallocateDataCallee = nullptr;
}
}
}
#else
// Empty function compiled in the #else branch, i.e. when
// KOKKOS_ENABLE_PROFILING is not defined.
// NOTE(review): judging by its name, it exists so this translation unit still
// defines a symbol when profiling is disabled - confirm against the build setup.
void KOKKOS_CORE_SRC_IMPL_PROFILING_INTERFACE_PREVENT_LINK_ERROR() {}
#endif

View File

@ -44,108 +44,108 @@
#ifndef KOKKOSP_INTERFACE_HPP
#define KOKKOSP_INTERFACE_HPP
#include <Kokkos_Macros.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Macros.hpp>
#include <string>
#include <cinttypes>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
#include <dlfcn.h>
#include <iostream>
#include <stdlib.h>
#endif
#include <cstdlib>
#define KOKKOSP_INTERFACE_VERSION 20150628
#if defined(KOKKOS_ENABLE_PROFILING)
namespace Kokkos {
namespace Profiling {
namespace Profiling {
struct SpaceHandle {
SpaceHandle(const char* space_name);
char name[64];
};
// Names a memory space in the allocate/deallocate profiling hooks, which
// take a SpaceHandle by value.
struct SpaceHandle {
// Copies space_name into 'name' (defined out of line).
SpaceHandle(const char* space_name);
// Fixed-size buffer holding the space name.
char name[64];
};
typedef void (*initFunction)(const int,
const uint64_t,
const uint32_t,
KokkosPDeviceInfo*);
typedef void (*finalizeFunction)();
typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
typedef void (*endFunction)(uint64_t);
// Function-pointer types of the library lifecycle and kernel begin/end hooks
// that a profiling tool may export.
using initFunction     = void (*)(const int,
                                  const uint64_t,
                                  const uint32_t,
                                  KokkosPDeviceInfo*);
using finalizeFunction = void (*)();
using beginFunction    = void (*)(const char*, const uint32_t, uint64_t*);
using endFunction      = void (*)(uint64_t);
typedef void (*pushFunction)(const char*);
typedef void (*popFunction)();
// Function-pointer types of the region hooks: push takes the region name,
// pop takes no arguments.
using pushFunction = void (*)(const char*);
using popFunction  = void (*)();
typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t);
// Function-pointer types of the memory event hooks. Both receive the memory
// space handle, a label, the pointer and a size.
using allocateDataFunction   = void (*)(const SpaceHandle, const char*, const void*, const uint64_t);
using deallocateDataFunction = void (*)(const SpaceHandle, const char*, const void*, const uint64_t);
static initFunction initProfileLibrary = NULL;
static finalizeFunction finalizeProfileLibrary = NULL;
// Library lifecycle hooks; NULL until initialize() binds them from
// "kokkosp_init_library" and "kokkosp_finalize_library".
// NOTE(review): these are 'static' variables in a header, so every
// translation unit that includes it gets its own copy; only the copies in the
// translation unit that defines initialize() are ever bound.
static initFunction initProfileLibrary = NULL;
static finalizeFunction finalizeProfileLibrary = NULL;
static beginFunction beginForCallee = NULL;
static beginFunction beginScanCallee = NULL;
static beginFunction beginReduceCallee = NULL;
static endFunction endForCallee = NULL;
static endFunction endScanCallee = NULL;
static endFunction endReduceCallee = NULL;
// Kernel begin/end hooks; NULL until initialize() binds them from the
// "kokkosp_begin_parallel_{for,scan,reduce}" and
// "kokkosp_end_parallel_{for,scan,reduce}" symbols of the loaded tool.
static beginFunction beginForCallee = NULL;
static beginFunction beginScanCallee = NULL;
static beginFunction beginReduceCallee = NULL;
static endFunction endForCallee = NULL;
static endFunction endScanCallee = NULL;
static endFunction endReduceCallee = NULL;
static pushFunction pushRegionCallee = NULL;
static popFunction popRegionCallee = NULL;
// Region hooks; NULL until initialize() binds them from
// "kokkosp_push_profile_region" and "kokkosp_pop_profile_region".
static pushFunction pushRegionCallee = NULL;
static popFunction popRegionCallee = NULL;
static allocateDataFunction allocateDataCallee = NULL;
static deallocateDataFunction deallocateDataCallee = NULL;
// Memory event hooks; NULL until initialize() binds them from
// "kokkosp_allocate_data" and "kokkosp_deallocate_data".
static allocateDataFunction allocateDataCallee = NULL;
static deallocateDataFunction deallocateDataCallee = NULL;
bool profileLibraryLoaded();
bool profileLibraryLoaded();
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelFor(const uint64_t kernelID);
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelScan(const uint64_t kernelID);
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelReduce(const uint64_t kernelID);
void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelFor(const uint64_t kernelID);
void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelScan(const uint64_t kernelID);
void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
void endParallelReduce(const uint64_t kernelID);
void pushRegion(const std::string& kName);
void popRegion();
void pushRegion(const std::string& kName);
void popRegion();
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size);
void initialize();
void finalize();
void initialize();
void finalize();
//Define finalize_fake inline to get rid of warnings for unused static variables
inline void finalize_fake() {
if(NULL != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
//Define finalize_fake inline to get rid of warnings for unused static variables
inline void finalize_fake() {
if(NULL != finalizeProfileLibrary) {
(*finalizeProfileLibrary)();
// Set all profile hooks to NULL to prevent
// any additional calls. Once we are told to
// finalize, we mean it
beginForCallee = NULL;
beginScanCallee = NULL;
beginReduceCallee = NULL;
endScanCallee = NULL;
endForCallee = NULL;
endReduceCallee = NULL;
allocateDataCallee = NULL;
deallocateDataCallee = NULL;
initProfileLibrary = NULL;
finalizeProfileLibrary = NULL;
pushRegionCallee = NULL;
popRegionCallee = NULL;
}
}
// Set all profile hooks to NULL to prevent
// any additional calls. Once we are told to
// finalize, we mean it
beginForCallee = NULL;
beginScanCallee = NULL;
beginReduceCallee = NULL;
endScanCallee = NULL;
endForCallee = NULL;
endReduceCallee = NULL;
allocateDataCallee = NULL;
deallocateDataCallee = NULL;
initProfileLibrary = NULL;
finalizeProfileLibrary = NULL;
pushRegionCallee = NULL;
popRegionCallee = NULL;
}
}
}
}
#endif
#endif

Some files were not shown because too many files have changed in this diff Show More