Updating Kokkos lib to v2.03.05
This commit is contained in:
@ -1,15 +1,38 @@
|
||||
#ifndef KOKKOS_CORE_CONFIG_H
|
||||
#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
|
||||
#error "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
|
||||
#else
|
||||
#define KOKKOS_CORE_CONFIG_H
|
||||
#endif
|
||||
|
||||
/* The trivial 'src/build_common.sh' creates a config
|
||||
* that must stay in sync with this file.
|
||||
*/
|
||||
#cmakedefine KOKKOS_FOR_SIERRA
|
||||
|
||||
#if !defined( KOKKOS_FOR_SIERRA )
|
||||
#ifndef KOKKOS_FOR_SIERRA
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_MPI
|
||||
#cmakedefine KOKKOS_HAVE_CUDA
|
||||
#cmakedefine KOKKOS_HAVE_OPENMP
|
||||
#cmakedefine KOKKOS_HAVE_PTHREAD
|
||||
#cmakedefine KOKKOS_HAVE_QTHREADS
|
||||
#cmakedefine KOKKOS_HAVE_SERIAL
|
||||
#cmakedefine KOKKOS_HAVE_Winthread
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_HWLOC
|
||||
#cmakedefine KOKKOS_ENABLE_HBWSPACE
|
||||
#cmakedefine KOKKOS_ENABLE_LIBRT
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_DEBUG
|
||||
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
#cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
|
||||
#cmakedefine KOKKOS_ENABLE_PROFILING
|
||||
#cmakedefine KOKKOS_ENABLE_PROFILING_LOAD_PRINT
|
||||
|
||||
#cmakedefine KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
#cmakedefine KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
|
||||
|
||||
// mfh 16 Sep 2014: If passed in on the command line, that overrides
|
||||
// any value of KOKKOS_USE_CUDA_UVM here. Doing this should prevent build
|
||||
@ -23,26 +46,8 @@
|
||||
// hesitate to do that now, because I'm not sure if all the files are
|
||||
// including KokkosCore_config.h (or a header file that includes it) like
|
||||
// they should.
|
||||
|
||||
#if ! defined(KOKKOS_USE_CUDA_UVM)
|
||||
#ifndef KOKKOS_USE_CUDA_UVM
|
||||
#cmakedefine KOKKOS_USE_CUDA_UVM
|
||||
#endif // ! defined(KOKKOS_USE_CUDA_UVM)
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_PTHREAD
|
||||
#cmakedefine KOKKOS_HAVE_SERIAL
|
||||
#cmakedefine KOKKOS_HAVE_QTHREADS
|
||||
#cmakedefine KOKKOS_HAVE_Winthread
|
||||
#cmakedefine KOKKOS_HAVE_OPENMP
|
||||
#cmakedefine KOKKOS_HAVE_HWLOC
|
||||
#cmakedefine KOKKOS_HAVE_DEBUG
|
||||
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
#cmakedefine KOKKOS_HAVE_CXX11
|
||||
#cmakedefine KOKKOS_HAVE_CUSPARSE
|
||||
#cmakedefine KOKKOS_ENABLE_PROFILING_INTERNAL
|
||||
#ifdef KOKKOS_ENABLE_PROFILING_INTERNAL
|
||||
#define KOKKOS_ENABLE_PROFILING 1
|
||||
#else
|
||||
#define KOKKOS_ENABLE_PROFILING 0
|
||||
#endif
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_CUDA_RDC
|
||||
@ -55,13 +60,51 @@
|
||||
#define KOKKOS_CUDA_USE_LAMBDA 1
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#cmakedefine KOKKOS_CUDA_CLANG_WORKAROUND
|
||||
|
||||
#ifndef __CUDA_ARCH__
|
||||
#cmakedefine KOKKOS_ENABLE_ISA_X86_64
|
||||
#cmakedefine KOKKOS_ENABLE_ISA_KNC
|
||||
#cmakedefine KOKKOS_ENABLE_ISA_POWERPCLE
|
||||
#endif
|
||||
|
||||
#cmakedefine KOKKOS_ARCH_ARMV80 1
|
||||
#cmakedefine KOKKOS_ARCH_ARMV81 1
|
||||
#cmakedefine KOKKOS_ARCH_ARMV8_THUNDERX 1
|
||||
#cmakedefine KOKKOS_ARCH_AVX 1
|
||||
#cmakedefine KOKKOS_ARCH_AVX2 1
|
||||
#cmakedefine KOKKOS_ARCH_AVX512MIC 1
|
||||
#cmakedefine KOKKOS_ARCH_AVX512XEON 1
|
||||
#cmakedefine KOKKOS_ARCH_KNC 1
|
||||
#cmakedefine KOKKOS_ARCH_POWER8 1
|
||||
#cmakedefine KOKKOS_ARCH_POWER9 1
|
||||
#cmakedefine KOKKOS_ARCH_KEPLER 1
|
||||
#cmakedefine KOKKOS_ARCH_KEPLER30 1
|
||||
#cmakedefine KOKKOS_ARCH_KEPLER32 1
|
||||
#cmakedefine KOKKOS_ARCH_KEPLER35 1
|
||||
#cmakedefine KOKKOS_ARCH_KEPLER37 1
|
||||
#cmakedefine KOKKOS_ARCH_MAXWELL 1
|
||||
#cmakedefine KOKKOS_ARCH_MAXWELL50 1
|
||||
#cmakedefine KOKKOS_ARCH_MAXWELL52 1
|
||||
#cmakedefine KOKKOS_ARCH_MAXWELL53 1
|
||||
#cmakedefine KOKKOS_ARCH_PASCAL 1
|
||||
#cmakedefine KOKKOS_ARCH_PASCAL60 1
|
||||
#cmakedefine KOKKOS_ARCH_PASCAL61 1
|
||||
|
||||
// Don't forbid users from defining this macro on the command line,
|
||||
// but still make sure that CMake logic can control its definition.
|
||||
#if ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
|
||||
#ifndef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
|
||||
#cmakedefine KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
|
||||
#endif // KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
|
||||
#endif
|
||||
|
||||
// TODO: These are currently not used in Kokkos. Should they be removed?
|
||||
#cmakedefine KOKKOS_HAVE_MPI
|
||||
#cmakedefine KOKKOS_HAVE_CUSPARSE
|
||||
|
||||
// TODO: No longer options in Kokkos. Need to be removed.
|
||||
#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
|
||||
#cmakedefine KOKKOS_HAVE_CXX11
|
||||
|
||||
#endif // KOKKOS_FOR_SIERRA
|
||||
#endif // KOKKOS_CORE_CONFIG_H
|
||||
|
||||
@ -1,11 +1,18 @@
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINRARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
# warning: PerfTest_CustomReduction.cpp uses
|
||||
# ../../algorithms/src/Kokkos_Random.hpp
|
||||
# we'll just allow it to be included, but note
|
||||
# that in TriBITS KokkosAlgorithms can be disabled...
|
||||
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src")
|
||||
|
||||
SET(SOURCES
|
||||
PerfTestMain.cpp
|
||||
PerfTestHost.cpp
|
||||
PerfTestCuda.cpp
|
||||
PerfTestGramSchmidt.cpp
|
||||
PerfTestHexGrad.cpp
|
||||
PerfTest_CustomReduction.cpp
|
||||
)
|
||||
|
||||
# Per #374, we always want to build this test, but we only want to run
|
||||
|
||||
@ -8,12 +8,14 @@ default: build_all
|
||||
echo "End Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
|
||||
KOKKOS_CUDA_OPTIONS=enable_lambda
|
||||
else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
CXXFLAGS = -O3
|
||||
#CXXFLAGS += -DGENERIC_REDUCER
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
|
||||
@ -21,23 +23,49 @@ include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test
|
||||
|
||||
TEST_TARGETS =
|
||||
TARGETS =
|
||||
TEST_TARGETS =
|
||||
TARGETS =
|
||||
|
||||
OBJ_PERF = PerfTestHost.o PerfTestCuda.o PerfTestMain.o gtest-all.o
|
||||
#
|
||||
|
||||
OBJ_PERF = PerfTestMain.o gtest-all.o
|
||||
OBJ_PERF += PerfTestGramSchmidt.o
|
||||
OBJ_PERF += PerfTestHexGrad.o
|
||||
OBJ_PERF += PerfTest_CustomReduction.o
|
||||
TARGETS += KokkosCore_PerformanceTest
|
||||
TEST_TARGETS += test-performance
|
||||
|
||||
#
|
||||
|
||||
OBJ_ATOMICS = test_atomic.o
|
||||
TARGETS += KokkosCore_PerformanceTest_Atomics
|
||||
TEST_TARGETS += test-atomic
|
||||
|
||||
#
|
||||
|
||||
OBJ_MEMPOOL = test_mempool.o
|
||||
TARGETS += KokkosCore_PerformanceTest_Mempool
|
||||
TEST_TARGETS += test-mempool
|
||||
|
||||
#
|
||||
|
||||
OBJ_TASKDAG = test_taskdag.o
|
||||
TARGETS += KokkosCore_PerformanceTest_TaskDAG
|
||||
TEST_TARGETS += test-taskdag
|
||||
|
||||
#
|
||||
|
||||
KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest
|
||||
|
||||
KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Atomics
|
||||
$(LINK) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_PerformanceTest_Atomics
|
||||
|
||||
KokkosCore_PerformanceTest_Mempool: $(OBJ_MEMPOOL) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_MEMPOOL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Mempool
|
||||
|
||||
KokkosCore_PerformanceTest_TaskDAG: $(OBJ_TASKDAG) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_TASKDAG) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_TaskDAG
|
||||
|
||||
test-performance: KokkosCore_PerformanceTest
|
||||
./KokkosCore_PerformanceTest
|
||||
@ -45,12 +73,18 @@ test-performance: KokkosCore_PerformanceTest
|
||||
test-atomic: KokkosCore_PerformanceTest_Atomics
|
||||
./KokkosCore_PerformanceTest_Atomics
|
||||
|
||||
test-mempool: KokkosCore_PerformanceTest_Mempool
|
||||
./KokkosCore_PerformanceTest_Mempool
|
||||
|
||||
test-taskdag: KokkosCore_PerformanceTest_TaskDAG
|
||||
./KokkosCore_PerformanceTest_TaskDAG
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
||||
clean: kokkos-clean
|
||||
clean: kokkos-clean
|
||||
rm -f *.o $(TARGETS)
|
||||
|
||||
# Compilation rules
|
||||
@ -58,5 +92,6 @@ clean: kokkos-clean
|
||||
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
|
||||
|
||||
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
|
||||
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
|
||||
|
||||
|
||||
@ -1,199 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <algorithm>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
#include <PerfTestMDRange.hpp>
|
||||
|
||||
#include <PerfTestHexGrad.hpp>
|
||||
#include <PerfTestBlasKernels.hpp>
|
||||
#include <PerfTestGramSchmidt.hpp>
|
||||
#include <PerfTestDriver.hpp>
|
||||
|
||||
|
||||
namespace Test {
|
||||
|
||||
class cuda : public ::testing::Test {
|
||||
protected:
|
||||
static void SetUpTestCase() {
|
||||
Kokkos::HostSpace::execution_space::initialize();
|
||||
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
|
||||
}
|
||||
static void TearDownTestCase() {
|
||||
Kokkos::Cuda::finalize();
|
||||
Kokkos::HostSpace::execution_space::finalize();
|
||||
}
|
||||
};
|
||||
|
||||
//TEST_F( cuda, mdrange_lr ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutRight>( 5, 8, "Kokkos::Cuda" )) );
|
||||
//}
|
||||
|
||||
//TEST_F( cuda, mdrange_ll ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutLeft>( 5, 8, "Kokkos::Cuda" )) );
|
||||
//}
|
||||
|
||||
// HexGrad benchmark on Kokkos::Cuda for the exponent range [10, 20).
TEST_F( cuda, hexgrad )
{
  const int exp_beg = 10 ;
  const int exp_end = 20 ;

  EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( exp_beg , exp_end, "Kokkos::Cuda" ) );
}
|
||||
|
||||
// Gram-Schmidt benchmark on Kokkos::Cuda for the exponent range [10, 20).
TEST_F( cuda, gramschmidt )
{
  const int exp_beg = 10 ;
  const int exp_end = 20 ;

  EXPECT_NO_THROW( run_test_gramschmidt< Kokkos::Cuda >( exp_beg , exp_end, "Kokkos::Cuda" ) );
}
|
||||
|
||||
namespace {
|
||||
|
||||
template <typename T>
|
||||
struct TextureFetch
|
||||
{
|
||||
typedef Kokkos::View< T *, Kokkos::CudaSpace> array_type;
|
||||
typedef Kokkos::View< const T *, Kokkos::CudaSpace, Kokkos::MemoryRandomAccess> const_array_type;
|
||||
typedef Kokkos::View< int *, Kokkos::CudaSpace> index_array_type;
|
||||
typedef Kokkos::View< const int *, Kokkos::CudaSpace> const_index_array_type;
|
||||
|
||||
struct FillArray
|
||||
{
|
||||
array_type m_array;
|
||||
FillArray( const array_type & array )
|
||||
: m_array(array)
|
||||
{}
|
||||
|
||||
void apply() const
|
||||
{
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.dimension_0()), *this);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(int i) const { m_array(i) = i; }
|
||||
};
|
||||
|
||||
struct RandomIndexes
|
||||
{
|
||||
index_array_type m_indexes;
|
||||
typename index_array_type::HostMirror m_host_indexes;
|
||||
RandomIndexes( const index_array_type & indexes)
|
||||
: m_indexes(indexes)
|
||||
, m_host_indexes(Kokkos::create_mirror(m_indexes))
|
||||
{}
|
||||
|
||||
void apply() const
|
||||
{
|
||||
Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::HostSpace::execution_space,int>(0,m_host_indexes.dimension_0()), *this);
|
||||
//random shuffle
|
||||
Kokkos::HostSpace::execution_space::fence();
|
||||
std::random_shuffle(m_host_indexes.ptr_on_device(), m_host_indexes.ptr_on_device() + m_host_indexes.dimension_0());
|
||||
Kokkos::deep_copy(m_indexes,m_host_indexes);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(int i) const { m_host_indexes(i) = i; }
|
||||
};
|
||||
|
||||
struct RandomReduce
|
||||
{
|
||||
const_array_type m_array;
|
||||
const_index_array_type m_indexes;
|
||||
RandomReduce( const const_array_type & array, const const_index_array_type & indexes)
|
||||
: m_array(array)
|
||||
, m_indexes(indexes)
|
||||
{}
|
||||
|
||||
void apply(T & reduce) const
|
||||
{
|
||||
Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.dimension_0()), *this, reduce);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(int i, T & reduce) const
|
||||
{ reduce += m_array(m_indexes(i)); }
|
||||
};
|
||||
|
||||
static void run(int size, double & reduce_time, T &reduce)
|
||||
{
|
||||
array_type array("array",size);
|
||||
index_array_type indexes("indexes",size);
|
||||
|
||||
{ FillArray f(array); f.apply(); }
|
||||
{ RandomIndexes f(indexes); f.apply(); }
|
||||
|
||||
Kokkos::Cuda::fence();
|
||||
|
||||
Kokkos::Timer timer;
|
||||
for (int j=0; j<10; ++j) {
|
||||
RandomReduce f(array,indexes);
|
||||
f.apply(reduce);
|
||||
}
|
||||
Kokkos::Cuda::fence();
|
||||
reduce_time = timer.seconds();
|
||||
}
|
||||
};
|
||||
|
||||
} // unnamed namespace
|
||||
|
||||
// Print the time of TextureFetch<double>::run for array sizes 2^1 .. 2^26.
// Only the time is reported; the reduction value itself is not checked.
TEST_F( cuda, texture_double )
{
  printf("Random reduce of double through texture fetch\n");

  const int max_log2_size = 26 ;
  for ( int log2_size = 1 ; log2_size <= max_log2_size ; ++log2_size ) {
    const int element_count = 1 << log2_size ;
    double elapsed = 0 ;
    double sum     = 0 ;

    TextureFetch<double>::run( element_count, elapsed, sum );

    printf(" time = %1.3e size = 2^%d\n", elapsed, log2_size);
  }
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
|
||||
|
||||
@ -398,91 +398,5 @@ void run_test_mdrange( int exp_beg , int exp_end, const char deviceTypeName[], i
|
||||
}
|
||||
|
||||
|
||||
template< class DeviceType >
|
||||
void run_test_hexgrad( int exp_beg , int exp_end, const char deviceTypeName[] )
|
||||
{
|
||||
std::string label_hexgrad ;
|
||||
label_hexgrad.append( "\"HexGrad< double , " );
|
||||
// mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
|
||||
// the string, not the actual name of the device type. Thus, I've
|
||||
// modified the function to take the name of the device type.
|
||||
//
|
||||
//label_hexgrad.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
|
||||
label_hexgrad.append( deviceTypeName );
|
||||
label_hexgrad.append( " >\"" );
|
||||
|
||||
for (int i = exp_beg ; i < exp_end ; ++i) {
|
||||
double min_seconds = 0.0 ;
|
||||
double max_seconds = 0.0 ;
|
||||
double avg_seconds = 0.0 ;
|
||||
|
||||
const int parallel_work_length = 1<<i;
|
||||
|
||||
for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
|
||||
const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ;
|
||||
|
||||
if ( 0 == j ) {
|
||||
min_seconds = seconds ;
|
||||
max_seconds = seconds ;
|
||||
}
|
||||
else {
|
||||
if ( seconds < min_seconds ) min_seconds = seconds ;
|
||||
if ( seconds > max_seconds ) max_seconds = seconds ;
|
||||
}
|
||||
avg_seconds += seconds ;
|
||||
}
|
||||
avg_seconds /= NUMBER_OF_TRIALS ;
|
||||
|
||||
std::cout << label_hexgrad
|
||||
<< " , " << parallel_work_length
|
||||
<< " , " << min_seconds
|
||||
<< " , " << ( min_seconds / parallel_work_length )
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
|
||||
template< class DeviceType >
|
||||
void run_test_gramschmidt( int exp_beg , int exp_end, const char deviceTypeName[] )
|
||||
{
|
||||
std::string label_gramschmidt ;
|
||||
label_gramschmidt.append( "\"GramSchmidt< double , " );
|
||||
// mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
|
||||
// the string, not the actual name of the device type. Thus, I've
|
||||
// modified the function to take the name of the device type.
|
||||
//
|
||||
//label_gramschmidt.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
|
||||
label_gramschmidt.append( deviceTypeName );
|
||||
label_gramschmidt.append( " >\"" );
|
||||
|
||||
for (int i = exp_beg ; i < exp_end ; ++i) {
|
||||
double min_seconds = 0.0 ;
|
||||
double max_seconds = 0.0 ;
|
||||
double avg_seconds = 0.0 ;
|
||||
|
||||
const int parallel_work_length = 1<<i;
|
||||
|
||||
for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
|
||||
const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ;
|
||||
|
||||
if ( 0 == j ) {
|
||||
min_seconds = seconds ;
|
||||
max_seconds = seconds ;
|
||||
}
|
||||
else {
|
||||
if ( seconds < min_seconds ) min_seconds = seconds ;
|
||||
if ( seconds > max_seconds ) max_seconds = seconds ;
|
||||
}
|
||||
avg_seconds += seconds ;
|
||||
}
|
||||
avg_seconds /= NUMBER_OF_TRIALS ;
|
||||
|
||||
std::cout << label_gramschmidt
|
||||
<< " , " << parallel_work_length
|
||||
<< " , " << min_seconds
|
||||
<< " , " << ( min_seconds / parallel_work_length )
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <PerfTest_Category.hpp>
|
||||
|
||||
#include <cmath>
|
||||
#include <PerfTestBlasKernels.hpp>
|
||||
|
||||
@ -70,7 +74,7 @@ struct InvNorm2 : public Kokkos::DotSingle< VectorView > {
|
||||
// Post-process the reduced dot product: turn it into the norm, store the norm
// through Rjj() and its reciprocal through inv(). A norm that is not positive
// yields a reciprocal of 0, so no division by zero takes place.
KOKKOS_INLINE_FUNCTION
void final( value_type & result ) const
{
  // Take the square root exactly once. Applying both the unqualified sqrt and
  // std::sqrt in sequence would produce the fourth root instead of the norm.
  result = std::sqrt( result );
  Rjj() = result ;
  inv() = ( 0 < result ) ? 1.0 / result : 0 ;
}
|
||||
@ -157,7 +161,7 @@ struct ModifiedGramSchmidt
|
||||
|
||||
for ( size_type j = 0 ; j < count ; ++j ) {
|
||||
// Reduction : tmp = dot( Q(:,j) , Q(:,j) );
|
||||
// PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
|
||||
// PostProcess : tmp = std::sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
|
||||
const vector_type Qj = Kokkos::subview( Q_ , Kokkos::ALL() , j );
|
||||
const value_view Rjj = Kokkos::subview( R_ , j , j );
|
||||
|
||||
@ -222,5 +226,58 @@ struct ModifiedGramSchmidt
|
||||
}
|
||||
};
|
||||
|
||||
template< class DeviceType >
|
||||
void run_test_gramschmidt( int exp_beg , int exp_end, int num_trials, const char deviceTypeName[] )
|
||||
{
|
||||
std::string label_gramschmidt ;
|
||||
label_gramschmidt.append( "\"GramSchmidt< double , " );
|
||||
label_gramschmidt.append( deviceTypeName );
|
||||
label_gramschmidt.append( " >\"" );
|
||||
|
||||
for (int i = exp_beg ; i < exp_end ; ++i) {
|
||||
double min_seconds = 0.0 ;
|
||||
double max_seconds = 0.0 ;
|
||||
double avg_seconds = 0.0 ;
|
||||
|
||||
const int parallel_work_length = 1<<i;
|
||||
|
||||
for ( int j = 0 ; j < num_trials ; ++j ) {
|
||||
const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ;
|
||||
|
||||
if ( 0 == j ) {
|
||||
min_seconds = seconds ;
|
||||
max_seconds = seconds ;
|
||||
}
|
||||
else {
|
||||
if ( seconds < min_seconds ) min_seconds = seconds ;
|
||||
if ( seconds > max_seconds ) max_seconds = seconds ;
|
||||
}
|
||||
avg_seconds += seconds ;
|
||||
}
|
||||
avg_seconds /= num_trials ;
|
||||
|
||||
std::cout << label_gramschmidt
|
||||
<< " , " << parallel_work_length
|
||||
<< " , " << min_seconds
|
||||
<< " , " << ( min_seconds / parallel_work_length )
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
|
||||
// Gram-Schmidt benchmark on Kokkos::DefaultExecutionSpace.
// Positional command-line arguments, when present, replace the defaults:
//   argument 1 -> exp_beg    (default 10)
//   argument 2 -> exp_end    (default 20)
//   argument 3 -> num_trials (default 5)
// Arguments are converted with atoi, so non-numeric text becomes 0.
TEST_F( default_exec, gramschmidt ) {
  const int arg_count = command_line_num_args();

  const int exp_beg    = ( arg_count > 1 ) ? atoi( command_line_arg(1) ) : 10 ;
  const int exp_end    = ( arg_count > 2 ) ? atoi( command_line_arg(2) ) : 20 ;
  const int num_trials = ( arg_count > 3 ) ? atoi( command_line_arg(3) ) : 5 ;

  EXPECT_NO_THROW(run_test_gramschmidt< Kokkos::DefaultExecutionSpace>( exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name() ));
}
|
||||
|
||||
}
|
||||
|
||||
@ -41,6 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <PerfTest_Category.hpp>
|
||||
|
||||
namespace Test {
|
||||
|
||||
template< class DeviceType ,
|
||||
@ -264,5 +268,58 @@ struct HexGrad
|
||||
}
|
||||
};
|
||||
|
||||
template< class DeviceType >
|
||||
void run_test_hexgrad( int exp_beg , int exp_end, int num_trials, const char deviceTypeName[] )
|
||||
{
|
||||
std::string label_hexgrad ;
|
||||
label_hexgrad.append( "\"HexGrad< double , " );
|
||||
label_hexgrad.append( deviceTypeName );
|
||||
label_hexgrad.append( " >\"" );
|
||||
|
||||
for (int i = exp_beg ; i < exp_end ; ++i) {
|
||||
double min_seconds = 0.0 ;
|
||||
double max_seconds = 0.0 ;
|
||||
double avg_seconds = 0.0 ;
|
||||
|
||||
const int parallel_work_length = 1<<i;
|
||||
|
||||
for ( int j = 0 ; j < num_trials ; ++j ) {
|
||||
const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ;
|
||||
|
||||
if ( 0 == j ) {
|
||||
min_seconds = seconds ;
|
||||
max_seconds = seconds ;
|
||||
}
|
||||
else {
|
||||
if ( seconds < min_seconds ) min_seconds = seconds ;
|
||||
if ( seconds > max_seconds ) max_seconds = seconds ;
|
||||
}
|
||||
avg_seconds += seconds ;
|
||||
}
|
||||
avg_seconds /= num_trials ;
|
||||
|
||||
std::cout << label_hexgrad
|
||||
<< " , " << parallel_work_length
|
||||
<< " , " << min_seconds
|
||||
<< " , " << ( min_seconds / parallel_work_length )
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
|
||||
// HexGrad benchmark on Kokkos::DefaultExecutionSpace.
// Positional command-line arguments, when present, replace the defaults:
//   argument 1 -> exp_beg    (default 10)
//   argument 2 -> exp_end    (default 20)
//   argument 3 -> num_trials (default 5)
// Arguments are converted with atoi, so non-numeric text becomes 0.
TEST_F( default_exec, hexgrad ) {
  const int arg_count = command_line_num_args();

  const int exp_beg    = ( arg_count > 1 ) ? atoi( command_line_arg(1) ) : 10 ;
  const int exp_end    = ( arg_count > 2 ) ? atoi( command_line_arg(2) ) : 20 ;
  const int num_trials = ( arg_count > 3 ) ? atoi( command_line_arg(3) ) : 5 ;

  EXPECT_NO_THROW(run_test_hexgrad< Kokkos::DefaultExecutionSpace >( exp_beg, exp_end, num_trials, Kokkos::DefaultExecutionSpace::name() ));
}
|
||||
|
||||
}
|
||||
|
||||
@ -1,125 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
typedef Kokkos::OpenMP TestHostDevice ;
|
||||
const char TestHostDeviceName[] = "Kokkos::OpenMP" ;
|
||||
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
|
||||
typedef Kokkos::Threads TestHostDevice ;
|
||||
const char TestHostDeviceName[] = "Kokkos::Threads" ;
|
||||
|
||||
#elif defined( KOKKOS_ENABLE_SERIAL )
|
||||
|
||||
typedef Kokkos::Serial TestHostDevice ;
|
||||
const char TestHostDeviceName[] = "Kokkos::Serial" ;
|
||||
|
||||
#else
|
||||
# error "You must enable at least one of the following execution spaces in order to build this test: Kokkos::Threads, Kokkos::OpenMP, or Kokkos::Serial."
|
||||
#endif
|
||||
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
#include <PerfTestMDRange.hpp>
|
||||
|
||||
#include <PerfTestHexGrad.hpp>
|
||||
#include <PerfTestBlasKernels.hpp>
|
||||
#include <PerfTestGramSchmidt.hpp>
|
||||
#include <PerfTestDriver.hpp>
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
namespace Test {
|
||||
|
||||
class host : public ::testing::Test {
|
||||
protected:
|
||||
static void SetUpTestCase()
|
||||
{
|
||||
if(Kokkos::hwloc::available()) {
|
||||
const unsigned numa_count = Kokkos::hwloc::get_available_numa_count();
|
||||
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
|
||||
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
|
||||
|
||||
unsigned threads_count = 0 ;
|
||||
|
||||
threads_count = std::max( 1u , numa_count )
|
||||
* std::max( 2u , cores_per_numa * threads_per_core );
|
||||
|
||||
TestHostDevice::initialize( threads_count );
|
||||
} else {
|
||||
const unsigned thread_count = 4 ;
|
||||
TestHostDevice::initialize( thread_count );
|
||||
}
|
||||
}
|
||||
|
||||
static void TearDownTestCase()
|
||||
{
|
||||
TestHostDevice::finalize();
|
||||
}
|
||||
};
|
||||
|
||||
//TEST_F( host, mdrange_lr ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutRight> (5, 8, TestHostDeviceName) ) );
|
||||
//}
|
||||
|
||||
//TEST_F( host, mdrange_ll ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutLeft> (5, 8, TestHostDeviceName) ) );
|
||||
//}
|
||||
|
||||
// HexGrad benchmark on TestHostDevice for the exponent range [10, 20).
TEST_F( host, hexgrad ) {
  const int exp_beg = 10 ;
  const int exp_end = 20 ;

  EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( exp_beg, exp_end, TestHostDeviceName ));
}
|
||||
|
||||
// Gram-Schmidt benchmark on TestHostDevice for the exponent range [10, 20).
TEST_F( host, gramschmidt ) {
  const int exp_beg = 10 ;
  const int exp_end = 20 ;

  EXPECT_NO_THROW(run_test_gramschmidt< TestHostDevice>( exp_beg, exp_end, TestHostDeviceName ));
}
|
||||
|
||||
} // namespace Test
|
||||
|
||||
|
||||
@ -42,8 +42,37 @@
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
namespace Test {

// Process-wide store for the number of command-line arguments.
//   n > 0   records n as the argument count
//   n <= 0  leaves the stored count untouched (pure query)
// Returns the stored count, which is 0 until a positive value is recorded.
int command_line_num_args(int n = 0) {
  static int n_args = 0;
  if(n>0)
    n_args = n;
  return n_args;
}

// Process-wide store for the argument vector.
//   input_args != NULL  records input_args as the argument vector
//   k                   index of the argument to return
// Returns argument k, or NULL when k is negative, when k is not below the
// count held by command_line_num_args(), or when no vector has been recorded.
// The vector is not copied, so it must outlive every later call.
const char* command_line_arg(int k, char** input_args = NULL) {
  static char** args = NULL;
  if(input_args != NULL)
    args = input_args;

  // Reject anything that would read outside the recorded vector.
  if(args == NULL || k < 0 || k >= command_line_num_args())
    return NULL;
  return args[k];
}

}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
::testing::InitGoogleTest(&argc,argv);
|
||||
return RUN_ALL_TESTS();
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
(void) Test::command_line_num_args(argc);
|
||||
(void) Test::command_line_arg(0,argv);
|
||||
|
||||
int result = RUN_ALL_TESTS();
|
||||
|
||||
Kokkos::finalize();
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -41,16 +41,28 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <cuda/TestCuda.hpp>
|
||||
#ifndef KOKKOS_TEST_THREADS_HPP
|
||||
#define KOKKOS_TEST_THREADS_HPP
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
namespace Test {
|
||||
|
||||
TEST_F( cuda, view_api_a )
|
||||
{
|
||||
typedef Kokkos::View< const int *, Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess> > view_texture_managed;
|
||||
typedef Kokkos::View< const int *, Kokkos::Cuda, Kokkos::MemoryTraits<Kokkos::RandomAccess | Kokkos::Unmanaged> > view_texture_unmanaged;
|
||||
extern int command_line_num_args(int n = 0);
|
||||
extern const char* command_line_arg(int k, char** input_args = NULL);
|
||||
|
||||
TestViewAPI< double, Kokkos::Cuda >();
|
||||
}
|
||||
// Google Test fixture used as the test category for the default execution
// space. Both suite-level hooks are empty.
class default_exec : public ::testing::Test {
protected:
  static void SetUpTestCase() {}

  static void TearDownTestCase() {}
};
|
||||
|
||||
} // namespace Test
|
||||
|
||||
#define TEST_CATEGORY default_exec
|
||||
#define TEST_EXECSPACE Kokkos::DefaultExecutionSpace
|
||||
|
||||
#endif
|
||||
115
lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
Normal file
115
lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp
Normal file
@ -0,0 +1,115 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <PerfTest_Category.hpp>
|
||||
#include <Kokkos_Random.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
||||
namespace Test {
|
||||
template<class Scalar>
|
||||
void custom_reduction_test(int N, int R, int num_trials) {
|
||||
Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
|
||||
Kokkos::View<Scalar*> a("A",N);
|
||||
Kokkos::fill_random(a,rand_pool,1.0);
|
||||
|
||||
Scalar max;
|
||||
|
||||
// Warm up
|
||||
Kokkos::parallel_reduce(Kokkos::TeamPolicy<>(N/1024,32), KOKKOS_LAMBDA( const Kokkos::TeamPolicy<>::member_type& team, Scalar& lmax) {
|
||||
Scalar team_max = Scalar(0);
|
||||
for(int rr = 0; rr<R; rr++) {
|
||||
int i = team.league_rank();
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,32), [&] (const int& j, Scalar& thread_max) {
|
||||
Scalar t_max = Scalar(0);
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,32), [&] (const int& k, Scalar& max_) {
|
||||
const Scalar val = a((i*32 + j)*32 + k);
|
||||
if(val>lmax) lmax = val;
|
||||
if((k == 11) && (j==17) && (i==2)) lmax = 11.5;
|
||||
},Kokkos::Experimental::Max<Scalar>(t_max));
|
||||
if(t_max>thread_max) thread_max = t_max;
|
||||
},Kokkos::Experimental::Max<Scalar>(team_max));
|
||||
}
|
||||
if(team_max>lmax) lmax = team_max;
|
||||
},Kokkos::Experimental::Max<Scalar>(max));
|
||||
|
||||
// Timing
|
||||
Kokkos::Timer timer;
|
||||
for(int r = 0; r<num_trials; r++) {
|
||||
Kokkos::parallel_reduce(Kokkos::TeamPolicy<>(N/1024,32), KOKKOS_LAMBDA( const Kokkos::TeamPolicy<>::member_type& team, Scalar& lmax) {
|
||||
Scalar team_max = Scalar(0);
|
||||
for(int rr = 0; rr<R; rr++) {
|
||||
int i = team.league_rank();
|
||||
Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,32), [&] (const int& j, Scalar& thread_max) {
|
||||
Scalar t_max = Scalar(0);
|
||||
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,32), [&] (const int& k, Scalar& max_) {
|
||||
const Scalar val = a((i*32 + j)*32 + k);
|
||||
if(val>lmax) lmax = val;
|
||||
if((k == 11) && (j==17) && (i==2)) lmax = 11.5;
|
||||
},Kokkos::Experimental::Max<Scalar>(t_max));
|
||||
if(t_max>thread_max) thread_max = t_max;
|
||||
},Kokkos::Experimental::Max<Scalar>(team_max));
|
||||
}
|
||||
if(team_max>lmax) lmax = team_max;
|
||||
},Kokkos::Experimental::Max<Scalar>(max));
|
||||
}
|
||||
double time = timer.seconds();
|
||||
printf("%e %e %e\n",time,1.0*N*R*num_trials*sizeof(Scalar)/time/1024/1024/1024,max);
|
||||
}
|
||||
|
||||
// Runs the custom-reduction benchmark for double. The defaults may be
// overridden positionally on the command line: <N> <R> <num_trials>.
TEST_F( default_exec, custom_reduction ) {
  // Defaults in command-line order: N, R, num_trials.
  int params[3] = { 100000, 1000, 1 };

  // Argument 'pos' is only read when it was actually supplied.
  for(int pos = 1; pos <= 3; ++pos) {
    if(command_line_num_args() > pos) {
      params[pos-1] = atoi(command_line_arg(pos));
    }
  }

  custom_reduction_test<double>(params[0],params[1],params[2]);
}
|
||||
}
|
||||
#endif
|
||||
25
lib/kokkos/core/perf_test/run_mempool.sh
Executable file
25
lib/kokkos/core/perf_test/run_mempool.sh
Executable file
@ -0,0 +1,25 @@
|
||||
#!/bin/bash -e
# Sweep the memory-pool benchmark over six size factors, three times over.
# Usage: run_mempool.sh <kokkos-thread-count>
# Produces the comma-separated table 'datapoints' (x column followed by one
# y column per sweep). 'log', 'xvals' and 'yvals1..3' remain as scratch files.
NT=$1
PROG="./KokkosCore_PerformanceTest_Mempool"
COMMON_ARGS="--kokkos-threads=$NT --fill_stride=1 --fill_level=70 --chunk_span=5 --repeat_inner=100"

# Append the last space-separated field of the log's first line to xvals
# and the last field of the log's last line to yvals.
postproc() {
  head -n 1 log | rev | cut -d ' ' -f 1 | rev >> xvals
  tail -n 1 log | rev | cut -d ' ' -f 1 | rev >> yvals
}

for yset in 1 2 3
do
  # xvals is rebuilt on every sweep; the copy from the last sweep is the one pasted below.
  rm -f xvals yvals
  for x in 1 2 4 8 16 32
  do
    echo "yset $yset x factor $x"
    $PROG $COMMON_ARGS --alloc_size=$(( x * 1000000 )) --super_size=$(( x * 100000 )) > log
    postproc
  done
  rm -f yvals$yset
  mv yvals yvals$yset
done

rm -f datapoints
paste -d',' xvals yvals1 yvals2 yvals3 > datapoints
|
||||
21
lib/kokkos/core/perf_test/run_mempool_fill.sh
Executable file
21
lib/kokkos/core/perf_test/run_mempool_fill.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/bash -e
# Run the memory-pool benchmark at several fill levels and tabulate the
# reported "fill ops per second" and "cycle ops per second" figures.
# Usage: run_mempool_fill.sh <kokkos-thread-count>
# Produces 'datapoints.txt' with columns: fill level, fill ops/s, cycle ops/s.
NT=$1
PROG="./KokkosCore_PerformanceTest_Mempool"
COMMON_ARGS="--kokkos-threads=$NT --fill_stride=1 --alloc_size=10027008 --super_size=65536 --repeat_inner=100 --chunk_span=4 --repeat_outer=10"

# Extract the second-to-last space-separated field of the matching log lines.
postproc() {
  grep "fill ops per second" log | rev | cut -d ' ' -f 2 | rev >> yvals_fill
  grep "cycle ops per second" log | rev | cut -d ' ' -f 2 | rev >> yvals_cycle
}

rm -f xvals yvals_fill yvals_cycle
for x in 75 95
do
  echo "test fill level $x"
  echo $x >> xvals
  $PROG $COMMON_ARGS --fill_level=$x 2>&1 | tee log
  postproc
done

# Remove the file that is actually written below (the output is datapoints.txt).
rm -f datapoints.txt
paste xvals yvals_fill yvals_cycle > datapoints.txt
|
||||
21
lib/kokkos/core/perf_test/run_taskdag.sh
Executable file
21
lib/kokkos/core/perf_test/run_taskdag.sh
Executable file
@ -0,0 +1,21 @@
|
||||
#!/bin/bash -e
# Run the task-DAG benchmark for a set of inputs and tabulate the reported
# "tasks per second" figure.
# Usage: run_taskdag.sh <kokkos-thread-count>
# Produces 'datapoints.txt' with columns: input value, tasks per second.
NT=$1
PROG="./KokkosCore_PerformanceTest_TaskDAG"
COMMON_ARGS="--kokkos-threads=$NT --alloc_size=10027008 --super_size=65536 --repeat_outer=10"

# Extract the second-to-last space-separated field of the matching log line.
postproc() {
  grep "tasks per second" log | rev | cut -d ' ' -f 2 | rev >> yvals
}

rm -f xvals yvals
for x in 21 23
do
  echo "test input $x"
  echo $x >> xvals
  $PROG $COMMON_ARGS --input=$x 2>&1 | tee log
  postproc
done

rm -f datapoints.txt
paste xvals yvals > datapoints.txt
|
||||
|
||||
357
lib/kokkos/core/perf_test/test_mempool.cpp
Normal file
357
lib/kokkos/core/perf_test/test_mempool.cpp
Normal file
@ -0,0 +1,357 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
using ExecSpace = Kokkos::DefaultExecutionSpace ;
|
||||
using MemorySpace = Kokkos::DefaultExecutionSpace::memory_space ;
|
||||
|
||||
using MemoryPool = Kokkos::MemoryPool< ExecSpace > ;
|
||||
|
||||
struct TestFunctor {
|
||||
|
||||
typedef Kokkos::View< uintptr_t * , ExecSpace > ptrs_type ;
|
||||
|
||||
enum : unsigned { chunk = 32 };
|
||||
|
||||
MemoryPool pool ;
|
||||
ptrs_type ptrs ;
|
||||
unsigned chunk_span ;
|
||||
unsigned fill_stride ;
|
||||
unsigned range_iter ;
|
||||
unsigned repeat_inner ;
|
||||
|
||||
TestFunctor( size_t total_alloc_size
|
||||
, unsigned min_superblock_size
|
||||
, unsigned number_alloc
|
||||
, unsigned arg_stride_alloc
|
||||
, unsigned arg_chunk_span
|
||||
, unsigned arg_repeat )
|
||||
: pool()
|
||||
, ptrs()
|
||||
, chunk_span(0)
|
||||
, fill_stride(0)
|
||||
, repeat_inner(0)
|
||||
{
|
||||
MemorySpace m ;
|
||||
|
||||
const unsigned min_block_size = chunk ;
|
||||
const unsigned max_block_size = chunk * arg_chunk_span ;
|
||||
pool = MemoryPool( m , total_alloc_size
|
||||
, min_block_size
|
||||
, max_block_size
|
||||
, min_superblock_size );
|
||||
|
||||
ptrs = ptrs_type( Kokkos::view_alloc( m , "ptrs") , number_alloc );
|
||||
fill_stride = arg_stride_alloc ;
|
||||
chunk_span = arg_chunk_span ;
|
||||
range_iter = fill_stride * number_alloc ;
|
||||
repeat_inner = arg_repeat ;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
typedef long value_type ;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
struct TagFill {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( TagFill , int i , value_type & update ) const noexcept
|
||||
{
|
||||
if ( 0 == i % fill_stride ) {
|
||||
|
||||
const int j = i / fill_stride ;
|
||||
|
||||
const unsigned size_alloc = chunk * ( 1 + ( j % chunk_span ) );
|
||||
|
||||
ptrs(j) = (uintptr_t) pool.allocate(size_alloc);
|
||||
|
||||
if ( ptrs(j) ) ++update ;
|
||||
}
|
||||
}
|
||||
|
||||
bool test_fill()
|
||||
{
|
||||
typedef Kokkos::RangePolicy< ExecSpace , TagFill > policy ;
|
||||
|
||||
long result = 0 ;
|
||||
|
||||
Kokkos::parallel_reduce( policy(0,range_iter), *this , result );
|
||||
|
||||
if ( result == long(ptrs.extent(0)) ) return true;
|
||||
pool.print_state( std::cerr );
|
||||
return false;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
struct TagDel {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( TagDel , int i ) const noexcept
|
||||
{
|
||||
if ( 0 == i % fill_stride ) {
|
||||
|
||||
const int j = i / fill_stride ;
|
||||
|
||||
const unsigned size_alloc = chunk * ( 1 + ( j % chunk_span ) );
|
||||
|
||||
pool.deallocate( (void*) ptrs(j) , size_alloc );
|
||||
}
|
||||
}
|
||||
|
||||
void test_del()
|
||||
{
|
||||
typedef Kokkos::RangePolicy< ExecSpace , TagDel > policy ;
|
||||
|
||||
Kokkos::parallel_for( policy(0,range_iter), *this );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
struct TagAllocDealloc {};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( TagAllocDealloc , int i , long & update ) const noexcept
|
||||
{
|
||||
if ( 0 == i % fill_stride ) {
|
||||
|
||||
const int j = i / fill_stride ;
|
||||
|
||||
if ( 0 == j % 3 ) {
|
||||
|
||||
for ( unsigned k = 0 ; k < repeat_inner ; ++k ) {
|
||||
|
||||
const unsigned size_alloc = chunk * ( 1 + ( j % chunk_span ) );
|
||||
|
||||
pool.deallocate( (void*) ptrs(j) , size_alloc );
|
||||
|
||||
ptrs(j) = (uintptr_t) pool.allocate(size_alloc);
|
||||
|
||||
if ( 0 == ptrs(j) ) update++ ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool test_alloc_dealloc()
|
||||
{
|
||||
typedef Kokkos::RangePolicy< ExecSpace , TagAllocDealloc > policy ;
|
||||
|
||||
long error_count = 0 ;
|
||||
|
||||
Kokkos::parallel_reduce( policy(0,range_iter), *this , error_count );
|
||||
|
||||
return 0 == error_count ;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
int main( int argc , char* argv[] )
|
||||
{
|
||||
static const char help_flag[] = "--help" ;
|
||||
static const char alloc_size_flag[] = "--alloc_size=" ;
|
||||
static const char super_size_flag[] = "--super_size=" ;
|
||||
static const char chunk_span_flag[] = "--chunk_span=" ;
|
||||
static const char fill_stride_flag[] = "--fill_stride=" ;
|
||||
static const char fill_level_flag[] = "--fill_level=" ;
|
||||
static const char repeat_outer_flag[] = "--repeat_outer=" ;
|
||||
static const char repeat_inner_flag[] = "--repeat_inner=" ;
|
||||
|
||||
long total_alloc_size = 1000000 ;
|
||||
int min_superblock_size = 10000 ;
|
||||
int chunk_span = 5 ;
|
||||
int fill_stride = 1 ;
|
||||
int fill_level = 70 ;
|
||||
int repeat_outer = 1 ;
|
||||
int repeat_inner = 1 ;
|
||||
|
||||
int ask_help = 0 ;
|
||||
|
||||
for(int i=1;i<argc;i++)
|
||||
{
|
||||
const char * const a = argv[i];
|
||||
|
||||
if ( ! strncmp(a,help_flag,strlen(help_flag) ) ) ask_help = 1 ;
|
||||
|
||||
if ( ! strncmp(a,alloc_size_flag,strlen(alloc_size_flag) ) )
|
||||
total_alloc_size = atol( a + strlen(alloc_size_flag) );
|
||||
|
||||
if ( ! strncmp(a,super_size_flag,strlen(super_size_flag) ) )
|
||||
min_superblock_size = atoi( a + strlen(super_size_flag) );
|
||||
|
||||
if ( ! strncmp(a,fill_stride_flag,strlen(fill_stride_flag) ) )
|
||||
fill_stride = atoi( a + strlen(fill_stride_flag) );
|
||||
|
||||
if ( ! strncmp(a,fill_level_flag,strlen(fill_level_flag) ) )
|
||||
fill_level = atoi( a + strlen(fill_level_flag) );
|
||||
|
||||
if ( ! strncmp(a,chunk_span_flag,strlen(chunk_span_flag) ) )
|
||||
chunk_span = atoi( a + strlen(chunk_span_flag) );
|
||||
|
||||
if ( ! strncmp(a,repeat_outer_flag,strlen(repeat_outer_flag) ) )
|
||||
repeat_outer = atoi( a + strlen(repeat_outer_flag) );
|
||||
|
||||
if ( ! strncmp(a,repeat_inner_flag,strlen(repeat_inner_flag) ) )
|
||||
repeat_inner = atoi( a + strlen(repeat_inner_flag) );
|
||||
}
|
||||
|
||||
int chunk_span_bytes = 0;
|
||||
for (int i = 0; i < chunk_span; ++i) {
|
||||
auto chunk_bytes = TestFunctor::chunk * ( 1 + i );
|
||||
if (chunk_bytes < 64) chunk_bytes = 64;
|
||||
auto block_bytes_lg2 = Kokkos::Impl::integral_power_of_two_that_contains( chunk_bytes );
|
||||
auto block_bytes = (1 << block_bytes_lg2);
|
||||
chunk_span_bytes += block_bytes;
|
||||
}
|
||||
auto actual_superblock_bytes_lg2 = Kokkos::Impl::integral_power_of_two_that_contains( min_superblock_size );
|
||||
auto actual_superblock_bytes = (1 << actual_superblock_bytes_lg2);
|
||||
auto superblock_mask = actual_superblock_bytes - 1;
|
||||
auto nsuperblocks = (total_alloc_size + superblock_mask) >> actual_superblock_bytes_lg2;
|
||||
auto actual_total_bytes = nsuperblocks * actual_superblock_bytes;
|
||||
auto bytes_wanted = (actual_total_bytes * fill_level) / 100;
|
||||
auto chunk_spans = bytes_wanted / chunk_span_bytes;
|
||||
auto number_alloc = int( chunk_spans * chunk_span );
|
||||
|
||||
if ( ask_help ) {
|
||||
std::cout << "command line options:"
|
||||
<< " " << help_flag
|
||||
<< " " << alloc_size_flag << "##"
|
||||
<< " " << super_size_flag << "##"
|
||||
<< " " << fill_stride_flag << "##"
|
||||
<< " " << fill_level_flag << "##"
|
||||
<< " " << chunk_span_flag << "##"
|
||||
<< " " << repeat_outer_flag << "##"
|
||||
<< " " << repeat_inner_flag << "##"
|
||||
<< std::endl ;
|
||||
return 0;
|
||||
}
|
||||
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
double sum_fill_time = 0;
|
||||
double sum_cycle_time = 0;
|
||||
double sum_both_time = 0;
|
||||
double min_fill_time = std::numeric_limits<double>::max();
|
||||
double min_cycle_time = std::numeric_limits<double>::max();
|
||||
double min_both_time = std::numeric_limits<double>::max();
|
||||
//one alloc in fill, alloc/dealloc pair in repeat_inner
|
||||
for ( int i = 0 ; i < repeat_outer ; ++i ) {
|
||||
|
||||
TestFunctor functor( total_alloc_size
|
||||
, min_superblock_size
|
||||
, number_alloc
|
||||
, fill_stride
|
||||
, chunk_span
|
||||
, repeat_inner );
|
||||
|
||||
Kokkos::Impl::Timer timer ;
|
||||
|
||||
if ( ! functor.test_fill() ) {
|
||||
Kokkos::abort("fill ");
|
||||
}
|
||||
|
||||
auto t0 = timer.seconds();
|
||||
|
||||
if ( ! functor.test_alloc_dealloc() ) {
|
||||
Kokkos::abort("alloc/dealloc ");
|
||||
}
|
||||
|
||||
auto t1 = timer.seconds();
|
||||
auto this_fill_time = t0;
|
||||
auto this_cycle_time = t1 - t0;
|
||||
auto this_both_time = t1;
|
||||
sum_fill_time += this_fill_time;
|
||||
sum_cycle_time += this_cycle_time;
|
||||
sum_both_time += this_both_time;
|
||||
min_fill_time = std::min(min_fill_time, this_fill_time);
|
||||
min_cycle_time = std::min(min_cycle_time, this_cycle_time);
|
||||
min_both_time = std::min(min_both_time, this_both_time);
|
||||
}
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
printf( "\"mempool: alloc super stride level span inner outer number\" %ld %d %d %d %d %d %d %d\n"
|
||||
, total_alloc_size
|
||||
, min_superblock_size
|
||||
, fill_stride
|
||||
, fill_level
|
||||
, chunk_span
|
||||
, repeat_inner
|
||||
, repeat_outer
|
||||
, number_alloc );
|
||||
|
||||
auto avg_fill_time = sum_fill_time / repeat_outer;
|
||||
auto avg_cycle_time = sum_cycle_time / repeat_outer;
|
||||
auto avg_both_time = sum_both_time / repeat_outer;
|
||||
|
||||
printf( "\"mempool: fill time (min, avg)\" %.8f %.8f\n"
|
||||
, min_fill_time
|
||||
, avg_fill_time );
|
||||
|
||||
printf( "\"mempool: cycle time (min, avg)\" %.8f %.8f\n"
|
||||
, min_cycle_time
|
||||
, avg_cycle_time );
|
||||
|
||||
printf( "\"mempool: test time (min, avg)\" %.8f %.8f\n"
|
||||
, min_both_time
|
||||
, avg_both_time );
|
||||
|
||||
printf( "\"mempool: fill ops per second (max, avg)\" %g %g\n"
|
||||
, number_alloc / min_fill_time
|
||||
, number_alloc / avg_fill_time );
|
||||
|
||||
printf( "\"mempool: cycle ops per second (max, avg)\" %g %g\n"
|
||||
, (2 * number_alloc * repeat_inner) / min_cycle_time
|
||||
, (2 * number_alloc * repeat_inner) / avg_cycle_time );
|
||||
}
|
||||
|
||||
284
lib/kokkos/core/perf_test/test_taskdag.cpp
Normal file
284
lib/kokkos/core/perf_test/test_taskdag.cpp
Normal file
@ -0,0 +1,284 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if ! defined( KOKKOS_ENABLE_TASKDAG ) || \
|
||||
defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
|
||||
// Fallback entry point for configurations where the preprocessor guard
// disables the benchmark: nothing is run and the program reports success.
int main()
{
  return 0 ;
}
|
||||
|
||||
#else
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <cstdlib>
|
||||
#include <limits>
|
||||
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
using ExecSpace = Kokkos::DefaultExecutionSpace ;
|
||||
|
||||
// Host-side reference value: returns the n-th Fibonacci number
// (F(0) = 0, F(1) = 1) computed iteratively in a four-entry ring buffer.
inline
long eval_fib( long n )
{
  // Entries are addressed by the low two bits of the sequence index.
  long ring[4] = { 0, 1, 0, 0 };

  long idx = 2 ;

  while ( idx <= n ) {
    const long prev1 = ring[ ( idx - 1 ) & 0x03 ];
    const long prev2 = ring[ ( idx - 2 ) & 0x03 ];
    ring[ idx & 0x03 ] = prev1 + prev2 ;
    ++idx ;
  }

  return ring[ n & 0x03 ];
}
|
||||
|
||||
// Expected number of task allocations made while evaluating Fibonacci(n)
// with the task functor: one for n < 2, otherwise two (the task itself plus
// its 'when_all' task) on top of the counts for n-1 and n-2.
inline
long fib_alloc_count( long n )
{
  // Four-entry ring buffer addressed by the low two bits of the index.
  long ring[4] = { 1, 1, 0, 0 };

  long idx = 2 ;

  while ( idx <= n ) {
    const long below1 = ring[ ( idx - 1 ) & 0x03 ];
    const long below2 = ring[ ( idx - 2 ) & 0x03 ];
    ring[ idx & 0x03 ] = 2 + below1 + below2 ;
    ++idx ;
  }

  return ring[ n & 0x03 ];
}
|
||||
|
||||
template< class Space >
|
||||
struct TestFib {
|
||||
|
||||
using Scheduler = Kokkos::TaskScheduler< Space > ;
|
||||
using MemorySpace = typename Scheduler::memory_space ;
|
||||
using MemberType = typename Scheduler::member_type ;
|
||||
using FutureType = Kokkos::Future< long , Space > ;
|
||||
|
||||
typedef long value_type ;
|
||||
|
||||
Scheduler sched ;
|
||||
FutureType dep[2] ;
|
||||
const value_type n ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TestFib( const Scheduler & arg_sched , const value_type arg_n )
|
||||
: sched( arg_sched ), dep{} , n( arg_n ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( const MemberType & , value_type & result ) noexcept
|
||||
{
|
||||
if ( n < 2 ) {
|
||||
result = n ;
|
||||
}
|
||||
else if ( ! dep[0].is_null() && ! dep[1].is_null() ) {
|
||||
result = dep[0].get() + dep[1].get();
|
||||
}
|
||||
else {
|
||||
// Spawn new children and respawn myself to sum their results.
|
||||
// Spawn lower value at higher priority as it has a shorter
|
||||
// path to completion.
|
||||
|
||||
dep[1] = Kokkos::task_spawn
|
||||
( Kokkos::TaskSingle( sched, Kokkos::TaskPriority::High )
|
||||
, TestFib( sched, n - 2 ) );
|
||||
|
||||
dep[0] = Kokkos::task_spawn
|
||||
( Kokkos::TaskSingle( sched )
|
||||
, TestFib( sched, n - 1 ) );
|
||||
|
||||
Kokkos::Future< ExecSpace > fib_all = Kokkos::when_all( dep, 2 );
|
||||
|
||||
if ( ! dep[0].is_null() && ! dep[1].is_null() && ! fib_all.is_null() ) {
|
||||
// High priority to retire this branch.
|
||||
Kokkos::respawn( this, fib_all, Kokkos::TaskPriority::High );
|
||||
}
|
||||
else {
|
||||
Kokkos::abort("Failed nested task spawn (allocation)");
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
int main( int argc , char* argv[] )
|
||||
{
|
||||
static const char help[] = "--help" ;
|
||||
static const char alloc_size[] = "--alloc_size=" ;
|
||||
static const char super_size[] = "--super_size=" ;
|
||||
static const char repeat_outer[] = "--repeat_outer=" ;
|
||||
static const char input_value[] = "--input=" ;
|
||||
|
||||
long total_alloc_size = 1000000 ;
|
||||
int min_superblock_size = 10000 ;
|
||||
int test_repeat_outer = 1 ;
|
||||
int fib_input = 4 ;
|
||||
|
||||
int ask_help = 0 ;
|
||||
|
||||
for(int i=1;i<argc;i++)
|
||||
{
|
||||
const char * const a = argv[i];
|
||||
|
||||
if ( ! strncmp(a,help,strlen(help) ) ) ask_help = 1 ;
|
||||
|
||||
if ( ! strncmp(a,alloc_size,strlen(alloc_size) ) )
|
||||
total_alloc_size = atol( a + strlen(alloc_size) );
|
||||
|
||||
if ( ! strncmp(a,super_size,strlen(super_size) ) )
|
||||
min_superblock_size = atoi( a + strlen(super_size) );
|
||||
|
||||
if ( ! strncmp(a,repeat_outer,strlen(repeat_outer) ) )
|
||||
test_repeat_outer = atoi( a + strlen(repeat_outer) );
|
||||
|
||||
if ( ! strncmp(a,input_value,strlen(input_value) ) )
|
||||
fib_input = atoi( a + strlen(input_value) );
|
||||
}
|
||||
|
||||
const long fib_output = eval_fib( fib_input );
|
||||
const long number_alloc = fib_alloc_count( fib_input );
|
||||
|
||||
const unsigned min_block_size = 32 ;
|
||||
const unsigned max_block_size = 128 ;
|
||||
|
||||
long task_count_max = 0 ;
|
||||
long task_count_accum = 0 ;
|
||||
long test_result = 0 ;
|
||||
|
||||
if ( ask_help ) {
|
||||
std::cout << "command line options:"
|
||||
<< " " << help
|
||||
<< " " << alloc_size << "##"
|
||||
<< " " << super_size << "##"
|
||||
<< " " << input_value << "##"
|
||||
<< " " << repeat_outer << "##"
|
||||
<< std::endl ;
|
||||
return -1;
|
||||
}
|
||||
|
||||
typedef TestFib< ExecSpace > Functor ;
|
||||
|
||||
Kokkos::initialize(argc,argv);
|
||||
|
||||
Functor::Scheduler sched( Functor::MemorySpace()
|
||||
, total_alloc_size
|
||||
, min_block_size
|
||||
, max_block_size
|
||||
, min_superblock_size
|
||||
);
|
||||
|
||||
Functor::FutureType f =
|
||||
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
|
||||
, Functor( sched , fib_input )
|
||||
);
|
||||
|
||||
Kokkos::wait( sched );
|
||||
|
||||
test_result = f.get();
|
||||
|
||||
task_count_max = sched.allocated_task_count_max();
|
||||
task_count_accum = sched.allocated_task_count_accum();
|
||||
|
||||
if ( number_alloc != task_count_accum ) {
|
||||
std::cout << " number_alloc( " << number_alloc << " )"
|
||||
<< " != task_count_accum( " << task_count_accum << " )"
|
||||
<< std::endl ;
|
||||
}
|
||||
|
||||
if ( fib_output != test_result ) {
|
||||
std::cout << " answer( " << fib_output << " )"
|
||||
<< " != result( " << test_result << " )"
|
||||
<< std::endl ;
|
||||
}
|
||||
|
||||
if ( fib_output != test_result || number_alloc != task_count_accum ) {
|
||||
printf(" TEST FAILED\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
double min_time = std::numeric_limits<double>::max();
|
||||
double time_sum = 0;
|
||||
|
||||
for ( int i = 0 ; i < test_repeat_outer ; ++i ) {
|
||||
Kokkos::Impl::Timer timer ;
|
||||
|
||||
Functor::FutureType ftmp =
|
||||
Kokkos::host_spawn( Kokkos::TaskSingle( sched )
|
||||
, Functor( sched , fib_input )
|
||||
);
|
||||
|
||||
Kokkos::wait( sched );
|
||||
auto this_time = timer.seconds();
|
||||
min_time = std::min(min_time, this_time);
|
||||
time_sum += this_time;
|
||||
}
|
||||
|
||||
auto avg_time = time_sum / test_repeat_outer;
|
||||
|
||||
Kokkos::finalize();
|
||||
|
||||
printf( "\"taskdag: alloc super repeat input output task-accum task-max\" %ld %d %d %d %ld %ld %ld\n"
|
||||
, total_alloc_size
|
||||
, min_superblock_size
|
||||
, test_repeat_outer
|
||||
, fib_input
|
||||
, fib_output
|
||||
, task_count_accum
|
||||
, task_count_max );
|
||||
|
||||
printf( "\"taskdag: time (min, avg)\" %g %g\n", min_time, avg_time);
|
||||
printf( "\"taskdag: tasks per second (max, avg)\" %g %g\n"
|
||||
, number_alloc / min_time
|
||||
, number_alloc / avg_time );
|
||||
|
||||
return 0 ;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -44,14 +44,12 @@
|
||||
#ifndef KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
|
||||
#define KOKKOS_CUDA_EXP_ITERATE_TILE_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
|
||||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
|
||||
@ -1298,3 +1296,4 @@ protected:
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,11 +45,10 @@
|
||||
#define KOKKOS_CUDAEXEC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <string>
|
||||
#include <cstdint>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_abort.hpp>
|
||||
@ -99,6 +98,8 @@ CudaSpace::size_type cuda_internal_maximum_warp_count();
|
||||
CudaSpace::size_type cuda_internal_maximum_grid_count();
|
||||
CudaSpace::size_type cuda_internal_maximum_shared_words();
|
||||
|
||||
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count();
|
||||
|
||||
CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
|
||||
CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
|
||||
CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
|
||||
@ -146,7 +147,7 @@ Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink = false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -319,3 +320,4 @@ struct CudaParallelLaunch< DriverType , false > {
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
#endif /* defined( KOKKOS_ENABLE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
|
||||
|
||||
|
||||
@ -41,16 +41,15 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <atomic>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
@ -890,9 +889,9 @@ void init_lock_arrays_cuda_space() {
|
||||
}
|
||||
}
|
||||
|
||||
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
|
||||
void* cuda_resize_scratch_space(std::int64_t bytes, bool force_shrink) {
|
||||
static void* ptr = NULL;
|
||||
static size_t current_size = 0;
|
||||
static std::int64_t current_size = 0;
|
||||
if(current_size == 0) {
|
||||
current_size = bytes;
|
||||
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
|
||||
@ -911,5 +910,7 @@ void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
|
||||
|
||||
}
|
||||
}
|
||||
#else
|
||||
// Empty function defined only in the #else branch of the KOKKOS_ENABLE_CUDA
// guard, i.e. when CUDA support is compiled out.  Judging by its name it gives
// the otherwise empty translation unit a symbol to avoid link errors -- confirm.
void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {}
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,8 +45,6 @@
|
||||
#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
@ -75,7 +73,7 @@ shared_allocation_record( Kokkos::CudaSpace const & arg_space
|
||||
new( functor ) DestructFunctor( arg_destruct );
|
||||
|
||||
record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
|
||||
|
||||
|
||||
return record ;
|
||||
}
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,8 +45,6 @@
|
||||
#define KOKKOS_CUDA_ERROR_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
@ -67,3 +65,4 @@ inline void cuda_internal_safe_call( cudaError e , const char * name, const char
|
||||
|
||||
#endif //KOKKOS_ENABLE_CUDA
|
||||
#endif //KOKKOS_CUDA_ERROR_HPP
|
||||
|
||||
|
||||
@ -44,11 +44,11 @@
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Kokkos interfaces */
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
@ -56,7 +56,7 @@
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Standard 'C' libraries */
|
||||
#include <stdlib.h>
|
||||
#include <cstdlib>
|
||||
|
||||
/* Standard 'C++' libraries */
|
||||
#include <vector>
|
||||
@ -404,9 +404,23 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
// Query what compute capability architecture a kernel executes:
|
||||
m_cudaArch = cuda_kernel_arch();
|
||||
|
||||
if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
|
||||
int compiled_major = m_cudaArch / 100;
|
||||
int compiled_minor = ( m_cudaArch % 100 ) / 10;
|
||||
|
||||
if ( compiled_major < 5 && cudaProp.major >= 5 ) {
|
||||
std::stringstream ss;
|
||||
ss << "Kokkos::Cuda::initialize ERROR: running kernels compiled for compute capability "
|
||||
<< compiled_major << "." << compiled_minor
|
||||
<< " (< 5.0) on device with compute capability "
|
||||
<< cudaProp.major << "." << cudaProp.minor
|
||||
<< " (>=5.0), this would give incorrect results!"
|
||||
<< std::endl ;
|
||||
std::string msg = ss.str();
|
||||
Kokkos::abort( msg.c_str() );
|
||||
}
|
||||
if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
|
||||
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
|
||||
<< ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
|
||||
<< compiled_major << "." << compiled_minor
|
||||
<< " on device with compute capability "
|
||||
<< cudaProp.major << "." << cudaProp.minor
|
||||
<< " , this will likely reduce potential performance."
|
||||
@ -661,6 +675,15 @@ void CudaInternal::finalize()
|
||||
/** \brief Multiprocessor count stored in the CudaInternal singleton. */
Cuda::size_type cuda_internal_multiprocessor_count()
{
  return CudaInternal::singleton().m_multiProcCount ;
}
|
||||
|
||||
/** \brief Upper bound on the number of thread blocks that can be resident
 *         on the device concurrently.
 *
 *  Computed as the multiprocessor count stored in the CudaInternal singleton
 *  multiplied by a fixed per-multiprocessor limit of resident blocks.
 *
 *  NOTE(review): the per-multiprocessor limit is hard-coded to the value
 *  quoted for compute capability 5.0 through 6.2; other architectures may
 *  use a different limit -- confirm before relying on this bound for them.
 */
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
{
  // Compute capability 5.0 through 6.2
  enum : int { max_resident_blocks_per_multiprocessor = 32 };

  return CudaInternal::singleton().m_multiProcCount
         * max_resident_blocks_per_multiprocessor ;
}
|
||||
|
||||
/** \brief Maximum warp count stored in the CudaInternal singleton. */
Cuda::size_type cuda_internal_maximum_warp_count()
{
  return CudaInternal::singleton().m_maxWarpCount ;
}
|
||||
|
||||
@ -772,8 +795,10 @@ void Cuda::fence()
|
||||
Kokkos::Impl::cuda_device_synchronize();
|
||||
}
|
||||
|
||||
// Returns the fixed, statically allocated string "Cuda" as the name of the
// Cuda class.
const char* Cuda::name() { return "Cuda"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
// Empty function defined only in the #else branch of the KOKKOS_ENABLE_CUDA
// guard, i.e. when CUDA support is compiled out.  Judging by its name it gives
// the otherwise empty translation unit a symbol to avoid link errors -- confirm.
void KOKKOS_CORE_SRC_CUDA_IMPL_PREVENT_LINK_ERROR() {}
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,19 +36,18 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_INTERNAL_HPP
|
||||
#define KOKKOS_CUDA_INTERNAL_HPP
|
||||
#include<iostream>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include<iostream>
|
||||
#include <Cuda/Kokkos_Cuda_Error.hpp>
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -45,8 +45,6 @@
|
||||
#define KOKKOS_CUDA_REDUCESCAN_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <utility>
|
||||
@ -63,6 +61,7 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Shuffle operations require input to be a register (stack) variable
|
||||
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
@ -140,77 +139,6 @@ void cuda_shfl_up( T & out , T const & in , int delta ,
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Reduce within a warp over blockDim.x, the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Requires:
|
||||
* blockDim.x is power of 2
|
||||
* blockDim.x <= 32 (one warp)
|
||||
*
|
||||
* Cannot use "butterfly" pattern because floating point
|
||||
* addition is non-associative. Therefore, must broadcast
|
||||
* the final result.
|
||||
*/
|
||||
template< class Reducer >
|
||||
__device__ inline
|
||||
void cuda_intra_warp_vector_reduce( Reducer const & reducer )
|
||||
{
|
||||
static_assert(
|
||||
std::is_reference< typename Reducer::reference_type >::value , "" );
|
||||
|
||||
if ( 1 < blockDim.x ) {
|
||||
|
||||
typename Reducer::value_type tmp ;
|
||||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
|
||||
|
||||
if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
|
||||
}
|
||||
|
||||
// Broadcast from root "lane" to all other "lanes"
|
||||
|
||||
cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Inclusive scan over blockDim.x, the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Algorithm is concurrent bottom-up reductions in triangular pattern
|
||||
* where each CUDA thread is the root of a reduction tree from the
|
||||
* zeroth CUDA thread to itself.
|
||||
*
|
||||
* Requires:
|
||||
* blockDim.x is power of 2
|
||||
* blockDim.x <= 32 (one warp)
|
||||
*/
|
||||
template< typename ValueType >
|
||||
__device__ inline
|
||||
void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
|
||||
{
|
||||
ValueType tmp ;
|
||||
|
||||
// Bottom up:
|
||||
// [t] += [t-1] if t >= 1
|
||||
// [t] += [t-2] if t >= 2
|
||||
// [t] += [t-4] if t >= 4
|
||||
// ...
|
||||
|
||||
for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
|
||||
|
||||
cuda_shfl_up( tmp , local , i , blockDim.x );
|
||||
|
||||
if ( i <= threadIdx.x ) { local += tmp ; }
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/*
|
||||
* Algorithmic constraints:
|
||||
@ -247,12 +175,12 @@ inline void cuda_inter_warp_reduction( ValueType& value,
|
||||
|
||||
#define STEP_WIDTH 4
|
||||
// Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries
|
||||
// The reason not to use ValueType directly is that for types with constructors it
|
||||
// The reason not to use ValueType directly is that for types with constructors it
|
||||
// could lead to race conditions
|
||||
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
|
||||
ValueType* result = (ValueType*) & sh_result;
|
||||
const unsigned step = 32 / blockDim.x;
|
||||
unsigned shift = STEP_WIDTH;
|
||||
const int step = 32 / blockDim.x;
|
||||
int shift = STEP_WIDTH;
|
||||
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
|
||||
if(id < STEP_WIDTH ) {
|
||||
result[id] = value;
|
||||
@ -297,7 +225,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
//Do the intra-block reduction with shfl operations and static shared memory
|
||||
cuda_intra_block_reduction(value,join,max_active_thread);
|
||||
|
||||
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
const int id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
|
||||
//One thread in the block writes block result to global scratch_memory
|
||||
if(id == 0 ) {
|
||||
@ -329,35 +257,35 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
|
||||
//Reduce all global values with splitting work over threads in one warp
|
||||
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
|
||||
for(int i=id; i<gridDim.x; i+=step_size) {
|
||||
for(int i=id; i<(int)gridDim.x; i+=step_size) {
|
||||
value_type tmp = global[i];
|
||||
join(value, tmp);
|
||||
}
|
||||
|
||||
//Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
|
||||
if (blockDim.x*blockDim.y > 1) {
|
||||
if (int(blockDim.x*blockDim.y) > 1) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 1,32);
|
||||
if( id + 1 < gridDim.x )
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 2) {
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < gridDim.x )
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 4) {
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < gridDim.x )
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 8) {
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < gridDim.x )
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
if (blockDim.x*blockDim.y > 16) {
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < gridDim.x )
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
join(value, tmp);
|
||||
}
|
||||
}
|
||||
@ -370,6 +298,166 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
|
||||
#endif
|
||||
}
|
||||
|
||||
template< class ReducerType >
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_intra_warp_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
unsigned int shift = 1;
|
||||
|
||||
ValueType result = reducer.reference();
|
||||
//Reduce over values from threads with different threadIdx.y
|
||||
while(blockDim.x * shift < 32 ) {
|
||||
const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
|
||||
//Only join if upper thread is active (this allows non power of two for blockDim.y
|
||||
if(threadIdx.y + shift < max_active_thread)
|
||||
reducer.join(result , tmp);
|
||||
shift*=2;
|
||||
}
|
||||
|
||||
result = shfl(result,0,32);
|
||||
reducer.reference() = result;
|
||||
}
|
||||
|
||||
template< class ReducerType >
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_inter_warp_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
#define STEP_WIDTH 4
|
||||
// Depending on the ValueType _shared__ memory must be aligned up to 8byte boundaries
|
||||
// The reason not to use ValueType directly is that for types with constructors it
|
||||
// could lead to race conditions
|
||||
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
|
||||
ValueType* result = (ValueType*) & sh_result;
|
||||
ValueType value = reducer.reference();
|
||||
const int step = 32 / blockDim.x;
|
||||
int shift = STEP_WIDTH;
|
||||
const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
|
||||
if(id < STEP_WIDTH ) {
|
||||
result[id] = value;
|
||||
}
|
||||
__syncthreads();
|
||||
while (shift<=max_active_thread/step) {
|
||||
if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
|
||||
reducer.join(result[id%STEP_WIDTH],value);
|
||||
}
|
||||
__syncthreads();
|
||||
shift+=STEP_WIDTH;
|
||||
}
|
||||
|
||||
|
||||
value = result[0];
|
||||
for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
|
||||
reducer.join(value,result[i]);
|
||||
|
||||
reducer.reference() = value;
|
||||
}
|
||||
|
||||
template< class ReducerType >
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value >::type
|
||||
cuda_intra_block_reduction( const ReducerType& reducer,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
cuda_intra_warp_reduction(reducer,max_active_thread);
|
||||
cuda_inter_warp_reduction(reducer,max_active_thread);
|
||||
}
|
||||
|
||||
template< class ReducerType>
|
||||
__device__ inline
|
||||
typename std::enable_if< Kokkos::is_reducer<ReducerType>::value , bool >::type
|
||||
cuda_inter_block_reduction( const ReducerType& reducer,
|
||||
Cuda::size_type * const m_scratch_space,
|
||||
Cuda::size_type * const m_scratch_flags,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
typedef typename ReducerType::value_type* pointer_type;
|
||||
typedef typename ReducerType::value_type value_type;
|
||||
|
||||
//Do the intra-block reduction with shfl operations and static shared memory
|
||||
cuda_intra_block_reduction(reducer,max_active_thread);
|
||||
|
||||
value_type value = reducer.reference();
|
||||
|
||||
const int id = threadIdx.y*blockDim.x + threadIdx.x;
|
||||
|
||||
//One thread in the block writes block result to global scratch_memory
|
||||
if(id == 0 ) {
|
||||
pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
|
||||
*global = value;
|
||||
}
|
||||
|
||||
//One warp of last block performs inter block reduction through loading the block values from global scratch_memory
|
||||
bool last_block = false;
|
||||
|
||||
__syncthreads();
|
||||
if ( id < 32 ) {
|
||||
Cuda::size_type count;
|
||||
|
||||
//Figure out whether this is the last block
|
||||
if(id == 0)
|
||||
count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
|
||||
count = Kokkos::shfl(count,0,32);
|
||||
|
||||
//Last block does the inter block reduction
|
||||
if( count == gridDim.x - 1) {
|
||||
//set flag back to zero
|
||||
if(id == 0)
|
||||
*m_scratch_flags = 0;
|
||||
last_block = true;
|
||||
reducer.init(value);
|
||||
|
||||
pointer_type const volatile global = (pointer_type) m_scratch_space ;
|
||||
|
||||
//Reduce all global values with splitting work over threads in one warp
|
||||
const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
|
||||
for(int i=id; i<(int)gridDim.x; i+=step_size) {
|
||||
value_type tmp = global[i];
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
|
||||
//Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
|
||||
if (int(blockDim.x*blockDim.y) > 1) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 1,32);
|
||||
if( id + 1 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 2) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 2,32);
|
||||
if( id + 2 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 4) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 4,32);
|
||||
if( id + 4 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 8) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 8,32);
|
||||
if( id + 8 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
if (int(blockDim.x*blockDim.y) > 16) {
|
||||
value_type tmp = Kokkos::shfl_down(value, 16,32);
|
||||
if( id + 16 < int(gridDim.x) )
|
||||
reducer.join(value, tmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//The last block has in its thread=0 the global reduction value through "value"
|
||||
return last_block;
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// See section B.17 of Cuda C Programming Guide Version 3.2
|
||||
// for discussion of
|
||||
@ -529,11 +617,11 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
|
||||
size_type * const global = global_data + word_count.value * block_id ;
|
||||
|
||||
#if (__CUDA_ARCH__ < 500)
|
||||
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
|
||||
#else
|
||||
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
|
||||
#endif
|
||||
//#if (__CUDA_ARCH__ < 500)
|
||||
for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
|
||||
//#else
|
||||
// for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
|
||||
//#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,15 +36,16 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -120,7 +121,7 @@ printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
|
||||
}
|
||||
|
||||
if ( 0 == warp_lane ) {
|
||||
queue->complete( task.ptr );
|
||||
queue->complete( task.ptr );
|
||||
}
|
||||
}
|
||||
} while(1);
|
||||
@ -157,7 +158,7 @@ printf("cuda_task_queue_execute before\n");
|
||||
// If not large enough then set the stack size, in bytes:
|
||||
//
|
||||
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
|
||||
|
||||
|
||||
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
@ -173,7 +174,7 @@ printf("cuda_task_queue_execute after\n");
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#else
|
||||
// Empty function defined only in the #else branch of the
// KOKKOS_ENABLE_CUDA && KOKKOS_ENABLE_TASKDAG guard.  Judging by its name it
// gives the otherwise empty translation unit a symbol to avoid link errors
// -- confirm.
void KOKKOS_CORE_SRC_CUDA_KOKKOS_CUDA_TASK_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
|
||||
|
||||
|
||||
@ -44,6 +44,7 @@
|
||||
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
#define KOKKOS_IMPL_CUDA_TASK_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -325,7 +326,7 @@ ValueType shfl_warp_broadcast
|
||||
return Kokkos::shfl(val, src_lane, width);
|
||||
}
|
||||
|
||||
// all-reduce across corresponding vector lanes between team members within warp
|
||||
/*// all-reduce across corresponding vector lanes between team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
@ -351,7 +352,7 @@ void parallel_reduce
|
||||
loop_boundaries.thread.team_size(),
|
||||
blockDim.x);
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
|
||||
}
|
||||
}*/
|
||||
|
||||
// all-reduce across corresponding vector lanes between team members within warp
|
||||
// if no join() provided, use sum
|
||||
@ -382,13 +383,36 @@ void parallel_reduce
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const ReducerType& reducer) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
//TODO what is the point of creating this temporary?
|
||||
ValueType result = ValueType();
|
||||
reducer.init(result);
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
strided_shfl_warp_reduction(
|
||||
[&] (ValueType& val1, const ValueType& val2) { reducer.join(val1,val2); },
|
||||
result,
|
||||
loop_boundaries.thread.team_size(),
|
||||
blockDim.x);
|
||||
reducer.reference() = shfl_warp_broadcast<ValueType>( result, threadIdx.x, Impl::CudaTraits::WarpSize );
|
||||
}
|
||||
// all-reduce within team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
// threadIdx.y == member number
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
/*template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
@ -404,7 +428,7 @@ void parallel_reduce
|
||||
|
||||
multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
|
||||
}
|
||||
}*/
|
||||
|
||||
// all-reduce within team members within warp
|
||||
// if no join() provided, use sum
|
||||
@ -436,6 +460,28 @@ void parallel_reduce
|
||||
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const ReducerType& reducer) {
|
||||
|
||||
typedef typename ReducerType::value_type ValueType;
|
||||
|
||||
ValueType result = ValueType();
|
||||
reducer.init(result);
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
multi_shfl_warp_reduction(
|
||||
[&] (ValueType& val1, const ValueType& val2) { reducer.join(val1, val2); },
|
||||
result,
|
||||
blockDim.x);
|
||||
reducer.reference() = shfl_warp_broadcast<ValueType>( result, 0, blockDim.x );
|
||||
}
|
||||
// scan across corresponding vector lanes between team members within warp
|
||||
// assume vec_length*team_size == warp_size
|
||||
// blockDim.x == vec_length == stride
|
||||
|
||||
982
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
Normal file
982
lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp
Normal file
@ -0,0 +1,982 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CUDA_TEAM_HPP
|
||||
#define KOKKOS_CUDA_TEAM_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <utility>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< typename Type >
|
||||
struct CudaJoinFunctor {
|
||||
typedef Type value_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void join( volatile value_type & update ,
|
||||
volatile const value_type & input )
|
||||
{ update += input ; }
|
||||
};
|
||||
|
||||
/**\brief Team member_type passed to TeamPolicy or TeamTask closures.
|
||||
*
|
||||
* Cuda thread blocks for team closures are dimensioned as:
|
||||
* blockDim.x == number of "vector lanes" per "thread"
|
||||
* blockDim.y == number of "threads" per team
|
||||
* blockDim.z == number of teams in a block
|
||||
* where
|
||||
* A set of teams exactly fill a warp OR a team is the whole block
|
||||
* ( 0 == WarpSize % ( blockDim.x * blockDim.y ) )
|
||||
* OR
|
||||
* ( 1 == blockDim.z )
|
||||
*
|
||||
* Thus when 1 < blockDim.z the team is warp-synchronous
|
||||
* and __syncthreads should not be called in team collectives.
|
||||
*
|
||||
* When multiple teams are mapped onto a single block then the
|
||||
* total available shared memory must be partitioned among teams.
|
||||
*/
|
||||
class CudaTeamMember {
|
||||
private:
|
||||
|
||||
typedef Kokkos::Cuda execution_space ;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space ;
|
||||
|
||||
void * m_team_reduce ;
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_reduce_size ;
|
||||
int m_league_rank ;
|
||||
int m_league_size ;
|
||||
|
||||
public:
|
||||
|
||||
/** \brief Scratch handle configured with set_team_thread_mode(0,1,0);
 *         the same arguments team_scratch() uses for level 0.
 */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const
{
  return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
}
|
||||
|
||||
/** \brief Scratch handle for the requested \c level, configured with
 *         set_team_thread_mode(level,1,0).
 */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_scratch( const int & level ) const
{
  return m_team_shared.set_team_thread_mode( level , 1 , 0 );
}
|
||||
|
||||
/** \brief Scratch handle for the requested \c level, configured with this
 *         member's team_size() and team_rank().
 */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & thread_scratch( const int & level ) const
{
  const int nthreads = team_size();
  const int rank     = team_rank();

  return m_team_shared.set_team_thread_mode( level , nthreads , rank );
}
|
||||
|
||||
/** \brief League rank that was handed to the constructor. */
KOKKOS_INLINE_FUNCTION
int league_rank() const
{
  return m_league_rank ;
}
|
||||
/** \brief League size that was handed to the constructor. */
KOKKOS_INLINE_FUNCTION
int league_size() const
{
  return m_league_size ;
}
|
||||
/** \brief threadIdx.y in device code; 0 when __CUDA_ARCH__ is not defined. */
KOKKOS_INLINE_FUNCTION
int team_rank() const
{
#if defined( __CUDA_ARCH__ )
  return threadIdx.y ;
#else
  return 0 ;
#endif
}
|
||||
|
||||
/** \brief blockDim.y in device code; 1 when __CUDA_ARCH__ is not defined. */
KOKKOS_INLINE_FUNCTION
int team_size() const
{
#if defined( __CUDA_ARCH__ )
  return blockDim.y ;
#else
  return 1 ;
#endif
}
|
||||
|
||||
/** \brief Team-wide barrier.
 *
 *  Uses __syncthreads() when the team is the whole block ( 1 == blockDim.z )
 *  and __threadfence_block() otherwise.  Does nothing when __CUDA_ARCH__
 *  is not defined.
 */
KOKKOS_INLINE_FUNCTION
void team_barrier() const
{
#if defined( __CUDA_ARCH__ )
  if ( blockDim.z != 1 ) {
    __threadfence_block(); // team <= warp
  }
  else {
    __syncthreads();       // team == block
  }
#endif
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
template<class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast( ValueType & val, const int& thread_id) const
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
if ( 1 == blockDim.z ) { // team == block
|
||||
__syncthreads();
|
||||
// Wait for shared data write until all threads arrive here
|
||||
if ( threadIdx.x == 0 && threadIdx.y == thread_id ) {
|
||||
*((ValueType*) m_team_reduce) = val ;
|
||||
}
|
||||
__syncthreads(); // Wait for shared data read until root thread writes
|
||||
val = *((ValueType*) m_team_reduce);
|
||||
}
|
||||
else { // team <= warp
|
||||
ValueType tmp( val ); // input might not be a register variable
|
||||
cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
/**\brief  Reduction across a team
 *
 *  Mapping of teams onto blocks:
 *    blockDim.x  is "vector lanes"
 *    blockDim.y  is team "threads"
 *    blockDim.z  is number of teams per block
 *
 *  Requires:
 *    blockDim.x is power two
 *    blockDim.x <= CudaTraits::WarpSize
 *    ( 0 == CudaTraits::WarpSize % ( blockDim.x * blockDim.y ) )
 *      OR
 *    ( 1 == blockDim.z )
 *
 *  Every calling thread has its reducer.reference() overwritten: by a warp
 *  shuffle when 1 < blockDim.z, otherwise with entry 0 of the m_team_reduce
 *  buffer.  The body is empty when __CUDA_ARCH__ is not defined.
 */
|
||||
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< is_reducer< ReducerType >::value >::type
|
||||
team_reduce( ReducerType const & reducer ) const noexcept
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
typedef typename ReducerType::value_type value_type ;
|
||||
|
||||
// 'tmp' carries this CUDA thread's running partial result;
// reducer.reference() is reused below as the shuffle receive buffer.
value_type tmp( reducer.reference() );
|
||||
|
||||
// reduce within the warp using shuffle
|
||||
|
||||
// Lane index computed from threadIdx.x and threadIdx.y only.
const int wx =
|
||||
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
|
||||
|
||||
// NOTE(review): the shuffle width is CudaTraits::WarpSize and wx ignores
// threadIdx.z; when several teams share a warp ( 1 < blockDim.z ) this looks
// like it can pull in values from lanes of another team -- confirm.
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
|
||||
|
||||
// Root of each vector lane reduces:
|
||||
if ( 0 == threadIdx.x && wx < i ) {
|
||||
reducer.join( tmp , reducer.reference() );
|
||||
}
|
||||
}
|
||||
|
||||
if ( 1 < blockDim.z ) { // team <= warp
|
||||
// broadcast result from root vector lane of root thread
// NOTE(review): the source lane below is blockDim.x * threadIdx.y, which
// depends on the calling thread's own threadIdx.y -- confirm this matches
// the "root thread" intent stated above.
|
||||
|
||||
cuda_shfl( reducer.reference() , tmp
|
||||
, blockDim.x * threadIdx.y , CudaTraits::WarpSize );
|
||||
|
||||
}
|
||||
else { // team == block
|
||||
// Reduce across warps using shared memory
|
||||
// Broadcast result within block
|
||||
|
||||
// Number of warps, blockDim.y may not be power of two:
|
||||
const int nw = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
// Warp index:
|
||||
const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
// Number of shared memory entries for the reduction:
|
||||
int nsh = m_team_reduce_size / sizeof(value_type);
|
||||
|
||||
// Using at most one entry per warp:
|
||||
if ( nw < nsh ) nsh = nw ;
|
||||
|
||||
__syncthreads(); // Wait before shared data write
|
||||
|
||||
// Lane 0 of each of the first nsh warps stores its partial result.
if ( 0 == wx && wy < nsh ) {
|
||||
((value_type*) m_team_reduce)[wy] = tmp ;
|
||||
}
|
||||
|
||||
// When more warps than shared entries:
// each pass lets the next group of nsh warps join into entries [0,nsh).
|
||||
for ( int i = nsh ; i < nw ; i += nsh ) {
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if ( 0 == wx && i <= wy ) {
|
||||
const int k = wy - i ;
|
||||
if ( k < nsh ) {
|
||||
reducer.join( *((value_type*) m_team_reduce + k) , tmp );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// One warp performs the inter-warp reduction:
|
||||
|
||||
if ( 0 == wy ) {
|
||||
|
||||
// Start at power of two covering nsh
|
||||
|
||||
// Entry wx absorbs entry wx + i while i halves down to 1.
for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
|
||||
const int k = wx + i ;
|
||||
if ( wx < i && k < nsh ) {
|
||||
reducer.join( ((value_type*)m_team_reduce)[wx]
|
||||
, ((value_type*)m_team_reduce)[k] );
|
||||
__threadfence_block();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for reduction
|
||||
|
||||
// Broadcast result to all threads
|
||||
reducer.reference() = *((value_type*)m_team_reduce);
|
||||
}
|
||||
|
||||
#endif /* #ifdef __CUDA_ARCH__ */
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
* with intra-team non-deterministic ordering accumulation.
|
||||
*
|
||||
* The global inter-team accumulation value will, at the end of the
|
||||
* league's parallel execution, be the scan's total.
|
||||
* Parallel execution ordering of the league's teams is non-deterministic.
|
||||
* As such the base value for each team's scan operation is similarly
|
||||
* non-deterministic.
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Type team_scan( const Type & value , Type * const global_accum ) const
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
Type * const base_data = (Type *) m_team_reduce ;
|
||||
|
||||
__syncthreads(); // Don't write in to shared data until all threads have entered this function
|
||||
|
||||
if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
|
||||
|
||||
base_data[ threadIdx.y + 1 ] = value ;
|
||||
|
||||
Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
|
||||
|
||||
if ( global_accum ) {
|
||||
if ( blockDim.y == threadIdx.y + 1 ) {
|
||||
base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
|
||||
}
|
||||
__syncthreads(); // Wait for atomic
|
||||
base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
|
||||
}
|
||||
|
||||
return base_data[ threadIdx.y ];
|
||||
#else
|
||||
return Type();
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
*
|
||||
* The highest rank thread can compute the reduction total as
|
||||
* reduction_total = dev.team_scan( value ) + value ;
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const {
|
||||
return this->template team_scan<Type>( value , 0 );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
typename std::enable_if< is_reducer< ReducerType >::value >::type
|
||||
vector_reduce( ReducerType const & reducer )
|
||||
{
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(blockDim.x == 1) return;
|
||||
|
||||
// Intra vector lane shuffle reduction:
|
||||
typename ReducerType::value_type tmp ( reducer.reference() );
|
||||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
cuda_shfl_down( reducer.reference() , tmp , i , blockDim.x );
|
||||
if ( threadIdx.x < i ) { reducer.join( tmp , reducer.reference() ); }
|
||||
}
|
||||
|
||||
// Broadcast from root lane to all other lanes.
|
||||
// Cannot use "butterfly" algorithm to avoid the broadcast
|
||||
// because floating point summation is not associative
|
||||
// and thus different threads could have different results.
|
||||
|
||||
cuda_shfl( reducer.reference() , tmp , 0 , blockDim.x );
|
||||
#endif
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
/**\brief  Global reduction across all blocks
 *
 *  Return !0 if reducer contains the final value
 *
 *  \param reducer               Provides value_type, reference(), data(),
 *                               length(), join() and copy().
 *  \param global_scratch_flags  Counter incremented with atomic_fetch_add and
 *                               compared against gridDim.x; set back to 0 by
 *                               the thread that observes the last increment.
 *  \param global_scratch_space  Per-block values, addressed at
 *                               blockIdx.x * reducer.length().
 *  \param shmem                 Work buffer for the fan-in steps.
 *  \param shmem_size            Size of \c shmem; divided by sizeof(value_type)
 *                               to obtain the number of usable entries.
 *
 *  1 is returned only by the thread with wx == 0 and wy == 0 of a block for
 *  which last_block is non-zero; every other thread returns 0, as does the
 *  build without __CUDA_ARCH__.
 */
|
||||
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
typename std::enable_if< is_reducer< ReducerType >::value , int >::type
|
||||
global_reduce( ReducerType const & reducer
|
||||
, int * const global_scratch_flags
|
||||
, void * const global_scratch_space
|
||||
, void * const shmem
|
||||
, int const shmem_size
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
typedef typename ReducerType::value_type value_type ;
|
||||
typedef value_type volatile * pointer_type ;
|
||||
|
||||
// Number of shared memory entries for the reduction:
|
||||
const int nsh = shmem_size / sizeof(value_type);
|
||||
|
||||
// Number of CUDA threads in the block, rank within the block
|
||||
const int nid = blockDim.x * blockDim.y * blockDim.z ;
|
||||
const int tid = threadIdx.x + blockDim.x * (
|
||||
threadIdx.y + blockDim.y * threadIdx.z );
|
||||
|
||||
// Reduces within block using all available shared memory
|
||||
// Contributes if it is the root "vector lane"
|
||||
|
||||
// wn == number of warps in the block
|
||||
// wx == which lane within the warp
|
||||
// wy == which warp within the block
|
||||
|
||||
const int wn = ( nid + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
|
||||
const int wx = tid & CudaTraits::WarpIndexMask ;
|
||||
const int wy = tid >> CudaTraits::WarpIndexShift ;
|
||||
|
||||
//------------------------
|
||||
{ // Intra warp shuffle reduction from contributing CUDA threads
|
||||
|
||||
// 'tmp' is the running partial; reducer.reference() receives the
// shuffled-in value from lane + i.
value_type tmp( reducer.reference() );
|
||||
|
||||
for ( int i = CudaTraits::WarpSize ; blockDim.x <= ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( reducer.reference(), tmp, i, CudaTraits::WarpSize );
|
||||
|
||||
// Root of each vector lane reduces "thread" contribution
|
||||
if ( 0 == threadIdx.x && wx < i ) {
|
||||
reducer.join( & tmp , reducer.data() );
|
||||
}
|
||||
}
|
||||
|
||||
// Reduce across warps using shared memory.
|
||||
// Number of warps may not be power of two.
|
||||
|
||||
__syncthreads(); // Wait before shared data write
|
||||
|
||||
// Number of shared memory entries for the reduction
|
||||
// is at most one per warp
|
||||
const int nentry = wn < nsh ? wn : nsh ;
|
||||
|
||||
if ( 0 == wx && wy < nentry ) {
|
||||
// Root thread of warp 'wy' has warp's value to contribute
|
||||
((value_type*) shmem)[wy] = tmp ;
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for write to be visible to block
|
||||
|
||||
// When more warps than shared entries
|
||||
// then warps must take turns joining their contribution
|
||||
// to the designated shared memory entry.
|
||||
for ( int i = nentry ; i < wn ; i += nentry ) {
|
||||
|
||||
const int k = wy - i ;
|
||||
|
||||
if ( 0 == wx && i <= wy && k < nentry ) {
|
||||
// Root thread of warp 'wy' has warp's value to contribute
|
||||
reducer.join( ((value_type*) shmem) + k , & tmp );
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for write to be visible to block
|
||||
}
|
||||
|
||||
// One warp performs the inter-warp reduction:
|
||||
|
||||
if ( 0 == wy ) {
|
||||
|
||||
// Start fan-in at power of two covering nentry
|
||||
|
||||
for ( int i = ( 1 << ( 32 - __clz(nentry-1) ) ) ; ( i >>= 1 ) ; ) {
|
||||
const int k = wx + i ;
|
||||
if ( wx < i && k < nentry ) {
|
||||
reducer.join( ((pointer_type)shmem) + wx
|
||||
, ((pointer_type)shmem) + k );
|
||||
__threadfence_block(); // Wait for write to be visible to warp
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
//------------------------
|
||||
{ // Write block's value to global_scratch_memory
|
||||
|
||||
int last_block = 0 ;
|
||||
|
||||
// NOTE(review): this guard is true for lane 0 of every warp, so a block
// with more than one warp appears to copy and to increment the flag once
// per warp rather than once per block -- confirm against the gridDim.x
// comparison below.
// NOTE(review): the value copied out is reducer.data(), while the fan-in
// above leaves its result in shmem -- confirm which one is intended.
if ( 0 == wx ) {
|
||||
reducer.copy( ((pointer_type)global_scratch_space)
|
||||
+ blockIdx.x * reducer.length()
|
||||
, reducer.data() );
|
||||
|
||||
__threadfence(); // Wait until global write is visible.
|
||||
|
||||
last_block = gridDim.x ==
|
||||
1 + Kokkos::atomic_fetch_add(global_scratch_flags,1);
|
||||
|
||||
// If last block then reset count
|
||||
if ( last_block ) *global_scratch_flags = 0 ;
|
||||
}
|
||||
|
||||
// Share the decision with the whole block (also acts as a barrier).
last_block = __syncthreads_or( last_block );
|
||||
|
||||
if ( ! last_block ) return 0 ;
|
||||
|
||||
}
|
||||
//------------------------
|
||||
// Last block reads global_scratch_memory into shared memory.
|
||||
|
||||
const int nentry = nid < gridDim.x ?
|
||||
( nid < nsh ? nid : nsh ) :
|
||||
( gridDim.x < nsh ? gridDim.x : nsh ) ;
|
||||
|
||||
// nentry = min( nid , nsh , gridDim.x )
|
||||
|
||||
// whole block reads global memory into shared memory:
|
||||
|
||||
if ( tid < nentry ) {
|
||||
|
||||
const int offset = tid * reducer.length();
|
||||
|
||||
reducer.copy( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space) + offset );
|
||||
|
||||
// Fold in the remaining per-block values, nentry apart.
for ( int i = nentry + tid ; i < gridDim.x ; i += nentry ) {
|
||||
reducer.join( ((pointer_type)shmem) + offset
|
||||
, ((pointer_type)global_scratch_space)
|
||||
+ i * reducer.length() );
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads(); // Wait for writes to be visible to block
|
||||
|
||||
if ( 0 == wy ) {
|
||||
|
||||
// Iterate to reduce shared memory to single warp fan-in size
|
||||
|
||||
const int nreduce = CudaTraits::WarpSize < nentry
|
||||
? CudaTraits::WarpSize : nentry ;
|
||||
|
||||
// nreduce = min( CudaTraits::WarpSize , nsh , gridDim.x )
|
||||
|
||||
if ( wx < nreduce && nreduce < nentry ) {
|
||||
for ( int i = nreduce + wx ; i < nentry ; i += nreduce ) {
|
||||
reducer.join( ((pointer_type)shmem) + wx
|
||||
, ((pointer_type)shmem) + i );
|
||||
}
|
||||
__threadfence_block(); // Wait for writes to be visible to warp
|
||||
}
|
||||
|
||||
// Start fan-in at power of two covering nentry
|
||||
|
||||
for ( int i = ( 1 << ( 32 - __clz(nreduce-1) ) ) ; ( i >>= 1 ) ; ) {
|
||||
const int k = wx + i ;
|
||||
if ( wx < i && k < nreduce ) {
|
||||
reducer.join( ((pointer_type)shmem) + wx
|
||||
, ((pointer_type)shmem) + k );
|
||||
__threadfence_block(); // Wait for writes to be visible to warp
|
||||
}
|
||||
}
|
||||
|
||||
// Lane 0 of warp 0 copies entry 0 into the reducer and reports success.
if ( 0 == wx ) {
|
||||
reducer.copy( reducer.data() , (pointer_type)shmem );
|
||||
return 1 ;
|
||||
}
|
||||
}
|
||||
return 0 ;
|
||||
|
||||
#else
|
||||
return 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver
|
||||
|
||||
/** \brief  Construct a team member handle.
 *
 *  \param shared                Start of the buffer; its first \c shared_begin
 *                               bytes become the team reduction buffer.
 *  \param shared_begin          Offset into \c shared where the team scratch
 *                               space starts; also stored as the size of the
 *                               reduction buffer.
 *  \param shared_size           Forwarded to the scratch space together with
 *                               the offset pointer.
 *  \param scratch_level_1_ptr   Forwarded to the scratch space.
 *  \param scratch_level_1_size  Forwarded to the scratch space.
 *  \param arg_league_rank       Value returned by league_rank().
 *  \param arg_league_size       Value returned by league_size().
 */
KOKKOS_INLINE_FUNCTION
CudaTeamMember( void *    shared
              , const int shared_begin
              , const int shared_size
              , void *    scratch_level_1_ptr
              , const int scratch_level_1_size
              , const int arg_league_rank
              , const int arg_league_size
              )
  : m_team_reduce( shared )
  , m_team_shared( ((char *)shared) + shared_begin
                 , shared_size
                 , scratch_level_1_ptr
                 , scratch_level_1_size )
  , m_team_reduce_size( shared_begin )
  , m_league_rank( arg_league_rank )
  , m_league_size( arg_league_size )
{
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
|
||||
typedef iType index_type;
|
||||
const CudaTeamMember& member;
|
||||
const iType start;
|
||||
const iType end;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count)
|
||||
: member(thread_)
|
||||
, start( 0 )
|
||||
, end( count ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_)
|
||||
: member(thread_)
|
||||
, start( begin_ )
|
||||
, end( end_ ) {}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template<typename iType>
|
||||
struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
|
||||
typedef iType index_type;
|
||||
const iType start;
|
||||
const iType end;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const CudaTeamMember, const iType& count)
|
||||
: start( 0 ), end( count ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const iType& count)
|
||||
: start( 0 ), end( count ) {}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >
|
||||
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType & count ) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, count );
|
||||
}
|
||||
|
||||
template< typename iType1, typename iType2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::CudaTeamMember >
|
||||
TeamThreadRange( const Impl::CudaTeamMember & thread, const iType1 & begin, const iType2 & end ) {
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember >( thread, iType(begin), iType(end) );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
|
||||
ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief Tag for Kokkos::single() selecting the ThreadSingleStruct overloads. */
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct< Impl::CudaTeamMember >
PerTeam( const Impl::CudaTeamMember & thread )
{
  typedef Impl::ThreadSingleStruct< Impl::CudaTeamMember > tag_type ;

  return tag_type( thread );
}
|
||||
|
||||
/** \brief Tag for Kokkos::single() selecting the VectorSingleStruct overloads. */
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct< Impl::CudaTeamMember >
PerThread( const Impl::CudaTeamMember & thread )
{
  typedef Impl::VectorSingleStruct< Impl::CudaTeamMember > tag_type ;

  return tag_type( thread );
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Inter-thread parallel_for.
|
||||
*
|
||||
* Executes closure(iType i) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all threads of the the calling thread team.
|
||||
*/
|
||||
template<typename iType, class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
for( iType i = loop_boundaries.start + threadIdx.y
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.y )
|
||||
closure(i);
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Inter-thread parallel_reduce with a reducer.
|
||||
*
|
||||
* Executes closure(iType i, ValueType & val) for each i=[0..N)
|
||||
*
|
||||
* The range [0..N) is mapped to all threads of the
|
||||
* calling thread team and a summation of val is
|
||||
* performed and put into result.
|
||||
*/
|
||||
template< typename iType, class Closure, class ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
, const ReducerType & reducer
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
reducer.init( reducer.reference() );
|
||||
|
||||
for( iType i = loop_boundaries.start + threadIdx.y
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.y ) {
|
||||
closure(i,reducer.reference());
|
||||
}
|
||||
|
||||
loop_boundaries.member.team_reduce( reducer );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/** \brief Inter-thread parallel_reduce assuming summation.
|
||||
*
|
||||
* Executes closure(iType i, ValueType & val) for each i=[0..N)
|
||||
*
|
||||
* The range [0..N) is mapped to all threads of the
|
||||
* calling thread team and a summation of val is
|
||||
* performed and put into result.
|
||||
*/
|
||||
template< typename iType, class Closure, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< ! Kokkos::is_reducer< ValueType >::value >::type
|
||||
parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember> &
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
, ValueType & result
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
Kokkos::Experimental::Sum<ValueType> reducer(result);
|
||||
|
||||
reducer.init( reducer.reference() );
|
||||
|
||||
for( iType i = loop_boundaries.start + threadIdx.y
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.y ) {
|
||||
closure(i,result);
|
||||
}
|
||||
|
||||
loop_boundaries.member.team_reduce( reducer );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Intra-thread vector parallel_for.
|
||||
*
|
||||
* Executes closure(iType i) for each i=[0..N)
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of the the calling thread.
|
||||
*/
|
||||
template<typename iType, class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>&
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
)
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
for ( iType i = loop_boundaries.start + threadIdx.x
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.x ) {
|
||||
closure(i);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce.
|
||||
*
|
||||
* Calls closure(iType i, ValueType & val) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of
|
||||
* the calling thread and a reduction of val is performed using +=
|
||||
* and output into result.
|
||||
*
|
||||
* The identity value for the += operator is assumed to be the default
|
||||
* constructed value.
|
||||
*/
|
||||
template< typename iType, class Closure, class ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce
|
||||
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
|
||||
const & loop_boundaries
|
||||
, Closure const & closure
|
||||
, ReducerType const & reducer )
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
reducer.init( reducer.reference() );
|
||||
|
||||
for ( iType i = loop_boundaries.start + threadIdx.x
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.x ) {
|
||||
closure(i,reducer.reference());
|
||||
}
|
||||
|
||||
Impl::CudaTeamMember::vector_reduce( reducer );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce.
|
||||
*
|
||||
* Calls closure(iType i, ValueType & val) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of
|
||||
* the calling thread and a reduction of val is performed using +=
|
||||
* and output into result.
|
||||
*
|
||||
* The identity value for the += operator is assumed to be the default
|
||||
* constructed value.
|
||||
*/
|
||||
template< typename iType, class Closure, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< ! is_reducer< ValueType >::value >::type
|
||||
parallel_reduce
|
||||
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember>
|
||||
const & loop_boundaries
|
||||
, Closure const & closure
|
||||
, ValueType & result )
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
result = ValueType();
|
||||
|
||||
for ( iType i = loop_boundaries.start + threadIdx.x
|
||||
; i < loop_boundaries.end
|
||||
; i += blockDim.x ) {
|
||||
closure(i,result);
|
||||
}
|
||||
|
||||
Impl::CudaTeamMember::vector_reduce(
|
||||
Kokkos::Experimental::Sum<ValueType>(result ) );
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Intra-thread vector parallel exclusive prefix sum.
|
||||
*
|
||||
* Executes closure(iType i, ValueType & val, bool final) for each i=[0..N)
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes in the
|
||||
* thread and a scan operation is performed.
|
||||
* The last call to closure has final == true.
|
||||
*/
|
||||
template< typename iType, class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
( const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
|
||||
loop_boundaries
|
||||
, const Closure & closure
|
||||
)
|
||||
{
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
|
||||
// Extract value_type from closure
|
||||
|
||||
using value_type =
|
||||
typename Kokkos::Impl::FunctorAnalysis
|
||||
< Kokkos::Impl::FunctorPatternInterface::SCAN
|
||||
, void
|
||||
, Closure >::value_type ;
|
||||
|
||||
// Loop through boundaries by vector-length chunks
|
||||
// must scan at each iteration
|
||||
|
||||
value_type accum = 0 ;
|
||||
|
||||
// All thread "lanes" must loop the same number of times.
|
||||
// Determine an loop end for all thread "lanes."
|
||||
// Requires:
|
||||
// blockDim.x is power of two and thus
|
||||
// ( end % blockDim.x ) == ( end & ( blockDim.x - 1 ) )
|
||||
// 1 <= blockDim.x <= CudaTraits::WarpSize
|
||||
|
||||
const int mask = blockDim.x - 1 ;
|
||||
const int rem = loop_boundaries.end & mask ; // == end % blockDim.x
|
||||
const int end = loop_boundaries.end + ( rem ? blockDim.x - rem : 0 );
|
||||
|
||||
for ( int i = threadIdx.x ; i < end ; i += blockDim.x ) {
|
||||
|
||||
value_type val = 0 ;
|
||||
|
||||
// First acquire per-lane contributions:
|
||||
if ( i < loop_boundaries.end ) closure( i , val , false );
|
||||
|
||||
value_type sval = val ;
|
||||
|
||||
// Bottom up inclusive scan in triangular pattern
|
||||
// where each CUDA thread is the root of a reduction tree
|
||||
// from the zeroth "lane" to itself.
|
||||
// [t] += [t-1] if t >= 1
|
||||
// [t] += [t-2] if t >= 2
|
||||
// [t] += [t-4] if t >= 4
|
||||
// ...
|
||||
|
||||
for ( int j = 1 ; j < blockDim.x ; j <<= 1 ) {
|
||||
value_type tmp = 0 ;
|
||||
Impl::cuda_shfl_up( tmp , sval , j , blockDim.x );
|
||||
if ( j <= threadIdx.x ) { sval += tmp ; }
|
||||
}
|
||||
|
||||
// Include accumulation and remove value for exclusive scan:
|
||||
val = accum + sval - val ;
|
||||
|
||||
// Provide exclusive scan value:
|
||||
if ( i < loop_boundaries.end ) closure( i , val , true );
|
||||
|
||||
// Accumulate the last value in the inclusive scan:
|
||||
Impl::cuda_shfl( sval , sval , mask , blockDim.x );
|
||||
|
||||
accum += sval ;
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(threadIdx.x == 0) lambda();
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(threadIdx.x == 0) lambda(val);
|
||||
val = shfl(val,0,blockDim.x);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
if(threadIdx.x == 0 && threadIdx.y == 0) {
|
||||
lambda(val);
|
||||
}
|
||||
single_struct.team_member.team_broadcast(val,0);
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* defined( __CUDACC__ ) */
|
||||
|
||||
#endif /* #ifndef KOKKOS_CUDA_TEAM_HPP */
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,8 +44,6 @@
|
||||
#define KOKKOS_CUDA_VECTORIZATION_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
@ -296,3 +294,4 @@ namespace Impl {
|
||||
|
||||
#endif // KOKKOS_ENABLE_CUDA
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,7 +44,7 @@
|
||||
#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -46,7 +46,7 @@
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
#include "Kokkos_Macros.hpp"
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <cuda.h>
|
||||
@ -82,6 +82,8 @@ void cuda_abort( const char * const message )
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
#else
|
||||
// Empty placeholder, compiled only in this #else branch, i.e. when
// __CUDACC__ and KOKKOS_ENABLE_CUDA are not both defined.  Going by its name
// it exists so that this configuration still links -- TODO confirm.
// NOTE(review): this is a non-inline definition inside an include-guarded
// header; verify it cannot be pulled into more than one translation unit.
void KOKKOS_CORE_SRC_CUDA_ABORT_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined(__CUDACC__) && defined( KOKKOS_ENABLE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
|
||||
|
||||
|
||||
@ -236,13 +236,13 @@ struct MDRangePolicy
|
||||
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
|
||||
|
||||
#else
|
||||
if(m_lower.size()!=rank || m_upper.size() != rank)
|
||||
if(static_cast<int>(m_lower.size()) != rank || static_cast<int>(m_upper.size()) != rank)
|
||||
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
|
||||
|
||||
for ( auto i = 0; i < rank; ++i ) {
|
||||
m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
|
||||
m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
|
||||
if(tile.size()==rank)
|
||||
if(static_cast<int>(tile.size())==rank)
|
||||
m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
|
||||
else
|
||||
m_tile[i] = 0;
|
||||
|
||||
@ -44,6 +44,7 @@
|
||||
#define KOKKOS_COMPLEX_HPP
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Kokkos_NumericTraits.hpp>
|
||||
#include <complex>
|
||||
#include <iostream>
|
||||
|
||||
@ -324,9 +325,30 @@ public:
|
||||
im_ /= src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! Equality with another complex number: true when both the real and the
//! imaginary parts compare equal.  Const-qualified because the comparison
//! does not modify this object, so it is also usable on a const left operand.
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& src) const {
  return (re_ == src.re_) && (im_ == src.im_);
}
|
||||
|
||||
//! Equality with a real scalar: true when the real part equals \c src and
//! the imaginary part equals RealType(0).  Const-qualified because the
//! comparison does not modify this object.
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType src) const {
  return (re_ == src) && (im_ == RealType(0));
}
|
||||
|
||||
//! Inequality with another complex number: true when the real parts or the
//! imaginary parts differ.  Const-qualified because the comparison does not
//! modify this object, so it is also usable on a const left operand.
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& src) const {
  return (re_ != src.re_) || (im_ != src.im_);
}
|
||||
|
||||
//! Inequality with a real scalar: true when the real part differs from
//! \c src or the imaginary part differs from RealType(0).  Const-qualified
//! because the comparison does not modify this object.
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType src) const {
  return (re_ != src) || (im_ != RealType(0));
}
|
||||
|
||||
};
|
||||
|
||||
//! Binary + operator for complex.
|
||||
//! Binary + operator for complex complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
@ -334,6 +356,22 @@ operator + (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
|
||||
}
|
||||
|
||||
//! Binary + operator for complex scalar.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const complex<RealType>& x, const RealType& y) {
|
||||
return complex<RealType> (x.real () + y , x.imag ());
|
||||
}
|
||||
|
||||
//! Binary + operator for scalar complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x + y.real (), y.imag ());
|
||||
}
|
||||
|
||||
//! Unary + operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -350,6 +388,22 @@ operator - (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
|
||||
}
|
||||
|
||||
//! Binary - operator for complex scalar.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const complex<RealType>& x, const RealType& y) {
|
||||
return complex<RealType> (x.real () - y , x.imag ());
|
||||
}
|
||||
|
||||
//! Binary - operator for scalar complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x - y.real (), - y.imag ());
|
||||
}
|
||||
|
||||
//! Unary - operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -395,6 +449,16 @@ operator * (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x * y.real (), x * y.imag ());
|
||||
}
|
||||
|
||||
/// \brief Binary * operator for RealType times complex.
|
||||
///
|
||||
/// This function exists because the compiler doesn't know that
|
||||
/// RealType and complex<RealType> commute with respect to operator*.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator * (const complex<RealType>& y, const RealType& x) {
|
||||
return complex<RealType> (x * y.real (), x * y.imag ());
|
||||
}
|
||||
|
||||
//! Imaginary part of a complex number.
|
||||
template<class RealType>
|
||||
@ -415,7 +479,25 @@ template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
RealType abs (const complex<RealType>& x) {
|
||||
// FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
|
||||
return ::sqrt (real (x) * real (x) + imag (x) * imag (x));
|
||||
return std::sqrt (real (x) * real (x) + imag (x) * imag (x));
|
||||
}
|
||||
|
||||
//! Power of a complex number
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Kokkos::complex<RealType> pow (const complex<RealType>& x, const RealType& e) {
|
||||
RealType r = abs(x);
|
||||
RealType phi = std::atan(x.imag()/x.real());
|
||||
return std::pow(r,e) * Kokkos::complex<RealType>(std::cos(phi*e),std::sin(phi*e));
|
||||
}
|
||||
|
||||
//! Square root of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Kokkos::complex<RealType> sqrt (const complex<RealType>& x) {
|
||||
RealType r = abs(x);
|
||||
RealType phi = std::atan(x.imag()/x.real());
|
||||
return std::sqrt(r) * Kokkos::complex<RealType>(std::cos(phi*0.5),std::sin(phi*0.5));
|
||||
}
|
||||
|
||||
//! Conjugate of a complex number.
|
||||
@ -425,6 +507,19 @@ complex<RealType> conj (const complex<RealType>& x) {
|
||||
return complex<RealType> (real (x), -imag (x));
|
||||
}
|
||||
|
||||
//! Exponential of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType> exp (const complex<RealType>& x) {
|
||||
return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()), std::sin(x.imag()));
|
||||
}
|
||||
|
||||
//! Exponential of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType> pow (const complex<RealType>& x) {
|
||||
return std::exp(x.real()) * complex<RealType> (std::cos (x.imag()), std::sin(x.imag()));
|
||||
}
|
||||
|
||||
//! Binary operator / for complex and real numbers
|
||||
template<class RealType1, class RealType2>
|
||||
@ -461,6 +556,14 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
}
|
||||
}
|
||||
|
||||
//! Binary operator / for complex and real numbers
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType1>
|
||||
operator / (const RealType1& x, const complex<RealType2>& y) {
|
||||
return complex<RealType1> (x)/y;
|
||||
}
|
||||
|
||||
//! Equality operator for two complex numbers.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -468,9 +571,13 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return real (x) == real (y) && imag (x) == imag (y);
|
||||
}
|
||||
|
||||
//! Equality operator for std::complex and Kokkos::complex.
|
||||
/// \brief Equality operator for std::complex and Kokkos::complex.
|
||||
///
|
||||
/// This cannot be a device function, since std::real is not.
|
||||
/// Otherwise, CUDA builds will give compiler warnings ("warning:
|
||||
/// calling a constexpr __host__ function("real") from a __host__
|
||||
/// __device__ function("operator==") is not allowed").
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return std::real (x) == real (y) && std::imag (x) == imag (y);
|
||||
}
|
||||
@ -533,6 +640,15 @@ std::ostream& operator >> (std::ostream& os, complex<RealType>& x) {
|
||||
}
|
||||
|
||||
|
||||
template<class T>
|
||||
struct reduction_identity<Kokkos::complex<T> > {
|
||||
typedef reduction_identity<T> t_red_ident;
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> sum()
|
||||
{return Kokkos::complex<T>(t_red_ident::sum(),t_red_ident::sum());}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex<T> prod()
|
||||
{return Kokkos::complex<T>(t_red_ident::prod(),t_red_ident::sum());}
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_COMPLEX_HPP
|
||||
|
||||
@ -200,7 +200,14 @@ public:
|
||||
, Kokkos::DefaultHostExecutionSpace , execution_space
|
||||
>::type host_execution_space ;
|
||||
#else
|
||||
typedef execution_space host_execution_space ;
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
typedef typename std::conditional
|
||||
< std::is_same< execution_space , Kokkos::Experimental::OpenMPTarget >::value
|
||||
, Kokkos::DefaultHostExecutionSpace , execution_space
|
||||
>::type host_execution_space ;
|
||||
#else
|
||||
typedef execution_space host_execution_space ;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
typedef typename std::conditional
|
||||
|
||||
@ -57,11 +57,16 @@
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
#endif
|
||||
|
||||
//#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
#include <Kokkos_OpenMPTarget.hpp>
|
||||
#include <Kokkos_OpenMPTargetSpace.hpp>
|
||||
//#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
#include <Kokkos_Qthreads.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
#include <Kokkos_Threads.hpp>
|
||||
#endif
|
||||
|
||||
@ -69,8 +74,8 @@
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
#include <Kokkos_Pair.hpp>
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
#include <Kokkos_Array.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
@ -167,3 +172,4 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -96,7 +96,7 @@ class Serial; ///< Execution space main process on CPU.
|
||||
class Qthreads; ///< Execution space with Qthreads back-end.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
class Threads; ///< Execution space with pthreads back-end.
|
||||
#endif
|
||||
|
||||
@ -104,6 +104,14 @@ class Threads; ///< Execution space with pthreads back-end.
|
||||
class OpenMP; ///< OpenMP execution space.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
namespace Experimental {
|
||||
class OpenMPTarget; ///< OpenMPTarget execution space.
|
||||
class OpenMPTargetSpace;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
class CudaSpace; ///< Memory space on Cuda GPU
|
||||
class CudaUVMSpace; ///< Memory space on Cuda GPU with UVM
|
||||
@ -121,12 +129,14 @@ struct Device;
|
||||
|
||||
/// Define Kokkos::DefaultExecutionSpace as per configuration option
|
||||
/// or chosen from the enabled execution spaces in the following order:
|
||||
/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
|
||||
/// Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
typedef Cuda DefaultExecutionSpace;
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
|
||||
typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef OpenMP DefaultExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
@ -136,7 +146,7 @@ namespace Kokkos {
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
typedef Serial DefaultExecutionSpace;
|
||||
#else
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
@ -149,7 +159,7 @@ namespace Kokkos {
|
||||
typedef Serial DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef OpenMP DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
typedef Threads DefaultHostExecutionSpace;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Qthreads DefaultHostExecutionSpace;
|
||||
@ -254,6 +264,21 @@ template< class FunctorType, class ExecPolicy, class ExecutionSapce =
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
namespace Experimental {
|
||||
template<class ScalarType , class Space = HostSpace> struct Sum;
|
||||
template<class ScalarType , class Space = HostSpace> struct Prod;
|
||||
template<class ScalarType , class Space = HostSpace> struct Min;
|
||||
template<class ScalarType , class Space = HostSpace> struct Max;
|
||||
template<class ScalarType , class Space = HostSpace> struct MinMax;
|
||||
template<class ScalarType , class Index, class Space = HostSpace> struct MinLoc;
|
||||
template<class ScalarType , class Index, class Space = HostSpace> struct MaxLoc;
|
||||
template<class ScalarType , class Index, class Space = HostSpace> struct MinMaxLoc;
|
||||
template<class ScalarType , class Space = HostSpace> struct BAnd;
|
||||
template<class ScalarType , class Space = HostSpace> struct BOr;
|
||||
template<class ScalarType , class Space = HostSpace> struct LAnd;
|
||||
template<class ScalarType , class Space = HostSpace> struct LOr;
|
||||
}
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
|
||||
|
||||
|
||||
@ -44,12 +44,11 @@
|
||||
#ifndef KOKKOS_CUDA_HPP
|
||||
#define KOKKOS_CUDA_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
// If CUDA execution space is enabled then use this header file.
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <iosfwd>
|
||||
#include <vector>
|
||||
|
||||
@ -214,6 +213,8 @@ public:
|
||||
//@}
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
static const char* name();
|
||||
|
||||
private:
|
||||
|
||||
cudaStream_t m_stream ;
|
||||
@ -291,6 +292,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
|
||||
#include <Cuda/Kokkos_CudaExec.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_View.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Team.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Task.hpp>
|
||||
|
||||
@ -300,5 +302,3 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
|
||||
#endif /* #ifndef KOKKOS_CUDA_HPP */
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,10 +44,11 @@
|
||||
#ifndef KOKKOS_CUDASPACE_HPP
|
||||
#define KOKKOS_CUDASPACE_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <iosfwd>
|
||||
#include <typeinfo>
|
||||
#include <string>
|
||||
@ -717,7 +718,7 @@ private:
|
||||
static ::cudaTextureObject_t
|
||||
attach_texture_object( const unsigned sizeof_alias
|
||||
, void * const alloc_ptr
|
||||
, const size_t alloc_size );
|
||||
, const size_t alloc_size );
|
||||
|
||||
static RecordBase s_root_record ;
|
||||
|
||||
|
||||
@ -565,5 +565,3 @@ ThreadVectorRange( const TeamMemberType&, const iType& count );
|
||||
|
||||
#endif /* #define KOKKOS_EXECPOLICY_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -44,12 +44,11 @@
|
||||
#ifndef KOKKOS_HBWSPACE_HPP
|
||||
#define KOKKOS_HBWSPACE_HPP
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Experimental {
|
||||
@ -114,7 +113,7 @@ public:
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
@ -348,5 +347,5 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokk
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
|
||||
#endif // #define KOKKOS_HBWSPACE_HPP
|
||||
|
||||
|
||||
@ -117,7 +117,7 @@ public:
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
@ -265,7 +265,7 @@ public:
|
||||
return (SharedAllocationRecord *) 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
@ -316,3 +316,4 @@ struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // #define KOKKOS_HOSTSPACE_HPP
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -47,7 +47,7 @@
|
||||
#ifndef KOKKOS_LAYOUT_HPP
|
||||
#define KOKKOS_LAYOUT_HPP
|
||||
|
||||
#include <stddef.h>
|
||||
#include <cstddef>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
@ -62,7 +62,7 @@ enum { ARRAY_LAYOUT_MAX_RANK = 8 };
|
||||
///
|
||||
/// This is an example of a \c MemoryLayout template parameter of
|
||||
/// View. The memory layout describes how View maps from a
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
///
|
||||
/// "Layout left" indicates a mapping where the leftmost index i0
|
||||
/// refers to contiguous access, and strides increase for dimensions
|
||||
@ -95,7 +95,7 @@ struct LayoutLeft {
|
||||
///
|
||||
/// This is an example of a \c MemoryLayout template parameter of
|
||||
/// View. The memory layout describes how View maps from a
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
///
|
||||
/// "Right layout" indicates a mapping where the rightmost index ik
|
||||
/// refers to contiguous access, and strides increase for dimensions
|
||||
@ -130,7 +130,7 @@ struct LayoutStride {
|
||||
typedef LayoutStride array_layout ;
|
||||
|
||||
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
|
||||
|
||||
LayoutStride( LayoutStride const & ) = default ;
|
||||
LayoutStride( LayoutStride && ) = default ;
|
||||
@ -192,7 +192,7 @@ struct LayoutStride {
|
||||
///
|
||||
/// This is an example of a \c MemoryLayout template parameter of
|
||||
/// View. The memory layout describes how View maps from a
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
/// multi-index (i0, i1, ..., ik) to a memory location.
|
||||
///
|
||||
/// "Tiled layout" indicates a mapping to contiguously stored
|
||||
/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
|
||||
|
||||
@ -48,9 +48,10 @@
|
||||
/** Pick up configure / build options via #define macros:
|
||||
*
|
||||
* KOKKOS_ENABLE_CUDA Kokkos::Cuda execution and memory spaces
|
||||
* KOKKOS_ENABLE_PTHREAD Kokkos::Threads execution space
|
||||
* KOKKOS_ENABLE_THREADS Kokkos::Threads execution space
|
||||
* KOKKOS_ENABLE_QTHREADS Kokkos::Qthreads execution space
|
||||
* KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space
|
||||
* KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget execution space
|
||||
* KOKKOS_ENABLE_HWLOC HWLOC library is available.
|
||||
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive!
|
||||
* KOKKOS_ENABLE_MPI Negotiate MPI/execution space interactions.
|
||||
@ -414,6 +415,7 @@
|
||||
// There is zero or one default execution space specified.
|
||||
|
||||
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
|
||||
@ -424,15 +426,18 @@
|
||||
// If default is not specified then choose from enabled execution spaces.
|
||||
// Priority: CUDA, OPENMPTARGET, OPENMP, THREADS, QTHREADS, SERIAL
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
#elif defined( KOKKOS_ENABLE_CUDA )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
|
||||
#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#elif defined( KOKKOS_ENABLE_THREADS )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
|
||||
@ -459,10 +464,17 @@
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Enable Profiling by default
|
||||
// If compiling with CUDA then must be using CUDA 8 or better
|
||||
// and use relocatable device code to enable the task policy.
|
||||
// nvcc relocatable device code option: --relocatable-device-code=true
|
||||
|
||||
#ifndef KOKKOS_ENABLE_PROFILING
|
||||
#define KOKKOS_ENABLE_PROFILING 1
|
||||
#if ( defined( KOKKOS_ENABLE_CUDA ) )
|
||||
#if ( 8000 <= CUDA_VERSION ) && defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
#endif
|
||||
#else
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
#endif
|
||||
|
||||
#endif // #ifndef KOKKOS_MACROS_HPP
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
217
lib/kokkos/core/src/Kokkos_NumericTraits.hpp
Normal file
217
lib/kokkos/core/src/Kokkos_NumericTraits.hpp
Normal file
@ -0,0 +1,217 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_NUMERICTRAITS_HPP
|
||||
#define KOKKOS_NUMERICTRAITS_HPP
|
||||
|
||||
#include<climits>
|
||||
#include<cfloat>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/// \brief Identity elements of the built-in reductions for a value type T.
///
/// The primary template is declared but intentionally left undefined, so
/// using it with a type that has no specialization is a compile-time error
/// (incomplete type). A specialization for a custom reduction type is
/// expected to provide static constexpr members returning T:
///   - sum()  : 0 (T())
///   - prod() : 1
///   - max()  : minimum value
///   - min()  : maximum value
///   - bor()  : 0, only for integer types
///   - band() : !0, only for integer types
///   - lor()  : 0, only for integer types
///   - land() : !0, only for integer types
template<typename T>
struct reduction_identity;
|
||||
|
||||
template<>
|
||||
struct reduction_identity<signed char> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char sum() {return static_cast<signed char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char prod() {return static_cast<signed char>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char max() {return SCHAR_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char min() {return SCHAR_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char bor() {return static_cast<signed char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char band() {return ~static_cast<signed char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char lor() {return static_cast<signed char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static signed char land() {return static_cast<signed char>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<short> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short sum() {return static_cast<short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short prod() {return static_cast<short>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short max() {return SHRT_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short min() {return SHRT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short bor() {return static_cast<short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short band() {return ~static_cast<short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short lor() {return static_cast<short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static short land() {return static_cast<short>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<int> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int sum() {return static_cast<int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int prod() {return static_cast<int>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int max() {return INT_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int min() {return INT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int bor() {return static_cast<int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int band() {return ~static_cast<int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int lor() {return static_cast<int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static int land() {return static_cast<int>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long sum() {return static_cast<long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long prod() {return static_cast<long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long max() {return LLONG_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long min() {return LLONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long bor() {return static_cast<long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long band() {return ~static_cast<long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long lor() {return static_cast<long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long land() {return static_cast<long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<long long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long sum() {return static_cast<long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long prod() {return static_cast<long long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long max() {return LLONG_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long min() {return LLONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long bor() {return static_cast<long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long band() {return ~static_cast<long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long lor() {return static_cast<long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long long land() {return static_cast<long long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned char> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char sum() {return static_cast<unsigned char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char prod() {return static_cast<unsigned char>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char max() {return static_cast<unsigned char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char min() {return UCHAR_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char bor() {return static_cast<unsigned char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char band() {return ~static_cast<unsigned char>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char lor() {return static_cast<unsigned char>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned char land() {return static_cast<unsigned char>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned short> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short sum() {return static_cast<unsigned short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short prod() {return static_cast<unsigned short>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short max() {return static_cast<unsigned short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short min() {return USHRT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short bor() {return static_cast<unsigned short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short band() {return ~static_cast<unsigned short>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short lor() {return static_cast<unsigned short>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned short land() {return static_cast<unsigned short>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned int> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int sum() {return static_cast<unsigned int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int prod() {return static_cast<unsigned int>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int max() {return static_cast<unsigned int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int min() {return UINT_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int bor() {return static_cast<unsigned int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int band() {return ~static_cast<unsigned int>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int lor() {return static_cast<unsigned int>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned int land() {return static_cast<unsigned int>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long sum() {return static_cast<unsigned long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long prod() {return static_cast<unsigned long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long max() {return static_cast<unsigned long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long min() {return ULONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long bor() {return static_cast<unsigned long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long band() {return ~static_cast<unsigned long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long lor() {return static_cast<unsigned long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long land() {return static_cast<unsigned long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<unsigned long long> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long sum() {return static_cast<unsigned long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long prod() {return static_cast<unsigned long long>(1);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long max() {return static_cast<unsigned long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long min() {return ULLONG_MAX;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long bor() {return static_cast<unsigned long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long band() {return ~static_cast<unsigned long long>(0x0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long lor() {return static_cast<unsigned long long>(0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static unsigned long long land() {return static_cast<unsigned long long>(1);}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<float> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum() {return static_cast<float>(0.0f);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float max() {return FLT_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static float min() {return FLT_MAX;}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<double> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum() {return static_cast<double>(0.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double max() {return DBL_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct reduction_identity<long double> {
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast<long double>(0.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return LDBL_MIN;}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;}
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@ -44,22 +44,23 @@
|
||||
#ifndef KOKKOS_OPENMP_HPP
|
||||
#define KOKKOS_OPENMP_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP)
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP) && !defined(_OPENMP)
|
||||
#if !defined(_OPENMP)
|
||||
#error "You enabled Kokkos OpenMP support without enabling OpenMP in the compiler!"
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP )
|
||||
|
||||
#include <omp.h>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
#include <Kokkos_HBWSpace.hpp>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskScheduler.hpp>
|
||||
@ -79,26 +80,26 @@ public:
|
||||
//@{
|
||||
|
||||
//! Tag this class as a kokkos execution space
|
||||
typedef OpenMP execution_space ;
|
||||
using execution_space = OpenMP;
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
typedef Experimental::HBWSpace memory_space ;
|
||||
using memory_space = Experimental::HBWSpace;
|
||||
#else
|
||||
typedef HostSpace memory_space ;
|
||||
using memory_space = HostSpace;
|
||||
#endif
|
||||
//! This execution space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
using device_type = Kokkos::Device<execution_space,memory_space>;
|
||||
|
||||
typedef LayoutRight array_layout ;
|
||||
typedef memory_space::size_type size_type ;
|
||||
using array_layout = LayoutRight;
|
||||
using size_type = memory_space::size_type;
|
||||
|
||||
typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
|
||||
using scratch_memory_space = ScratchMemorySpace< OpenMP >;
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
//! \name Functions that all Kokkos execution spaces must implement.
|
||||
//@{
|
||||
|
||||
inline static bool in_parallel() { return omp_in_parallel(); }
|
||||
inline static bool in_parallel();
|
||||
|
||||
/** \brief Set the device in a "sleep" state. A noop for OpenMP. */
|
||||
static bool sleep();
|
||||
@ -153,6 +154,8 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
|
||||
static const char* name();
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
@ -164,7 +167,7 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess
|
||||
struct MemorySpaceAccess
|
||||
< Kokkos::OpenMP::memory_space
|
||||
, Kokkos::OpenMP::scratch_memory_space
|
||||
>
|
||||
@ -191,7 +194,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <OpenMP/Kokkos_OpenMPexec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
|
||||
|
||||
@ -201,4 +204,3 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */
|
||||
#endif /* #ifndef KOKKOS_OPENMP_HPP */
|
||||
|
||||
|
||||
|
||||
186
lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
Normal file
186
lib/kokkos/core/src/Kokkos_OpenMPTarget.hpp
Normal file
@ -0,0 +1,186 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGET_HPP
|
||||
#define KOKKOS_OPENMPTARGET_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP )
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_OpenMPTargetSpace.hpp>
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_TaskPolicy.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
/// \class OpenMPTarget
|
||||
/// \brief Kokkos device for multicore processors in the host memory space.
|
||||
class OpenMPTarget {
|
||||
public:
|
||||
//------------------------------------
|
||||
//! \name Type declarations that all Kokkos devices must provide.
|
||||
//@{
|
||||
|
||||
//! Tag this class as a kokkos execution space
|
||||
typedef OpenMPTarget execution_space ;
|
||||
typedef OpenMPTargetSpace memory_space ;
|
||||
//! This execution space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
typedef LayoutLeft array_layout ;
|
||||
typedef memory_space::size_type size_type ;
|
||||
|
||||
typedef ScratchMemorySpace< OpenMPTarget > scratch_memory_space ;
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
//! \name Functions that all Kokkos execution spaces must implement.
|
||||
//@{
|
||||
|
||||
inline static bool in_parallel() { return omp_in_parallel(); }
|
||||
|
||||
/** \brief Set the device in a "sleep" state. A noop for OpenMPTarget. */
|
||||
static bool sleep();
|
||||
|
||||
/** \brief Wake the device from the 'sleep' state. A noop for OpenMPTarget. */
|
||||
static bool wake();
|
||||
|
||||
/** \brief Wait until all dispatched functors complete. A noop for OpenMPTarget. */
|
||||
static void fence() {}
|
||||
|
||||
/// \brief Print configuration information to the given output stream.
|
||||
static void print_configuration( std::ostream & , const bool detail = false );
|
||||
|
||||
/// \brief Free any resources being consumed by the device.
|
||||
static void finalize();
|
||||
|
||||
/** \brief Initialize the device.
|
||||
*
|
||||
* 1) If the hardware locality library is enabled and OpenMPTarget has not
|
||||
* already bound threads then bind OpenMPTarget threads to maximize
|
||||
* core utilization and group for memory hierarchy locality.
|
||||
*
|
||||
* 2) Allocate a HostThread for each OpenMPTarget thread to hold its
|
||||
* topology and fan in/out data.
|
||||
*/
|
||||
static void initialize( unsigned thread_count = 0 ,
|
||||
unsigned use_numa_count = 0 ,
|
||||
unsigned use_cores_per_numa = 0 );
|
||||
|
||||
static int is_initialized();
|
||||
|
||||
/** \brief Return the maximum amount of concurrency. */
|
||||
static int concurrency();
|
||||
|
||||
//@}
|
||||
//------------------------------------
|
||||
/** \brief This execution space has a topological thread pool which can be queried.
|
||||
*
|
||||
* All threads within a pool have a common memory space for which they are cache coherent.
|
||||
* depth = 0 gives the number of threads in the whole pool.
|
||||
* depth = 1 gives the number of threads in a NUMA region, typically sharing L3 cache.
|
||||
* depth = 2 gives the number of threads at the finest granularity, typically sharing L1 cache.
|
||||
*/
|
||||
inline static int thread_pool_size( int depth = 0 );
|
||||
|
||||
/** \brief The rank of the executing thread in this thread pool */
|
||||
KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
|
||||
|
||||
//------------------------------------
|
||||
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
|
||||
static const char* name();
|
||||
private:
|
||||
static bool m_is_initialized;
|
||||
};
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::Experimental::OpenMPTarget::memory_space
|
||||
, Kokkos::Experimental::OpenMPTarget::scratch_memory_space
|
||||
>
|
||||
{
|
||||
enum { value = true };
|
||||
inline static void verify( void ) { }
|
||||
inline static void verify( const void * ) { }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp>
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP ) */
|
||||
#endif /* #ifndef KOKKOS_OPENMPTARGET_HPP */
|
||||
|
||||
|
||||
265
lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
Normal file
265
lib/kokkos/core/src/Kokkos_OpenMPTargetSpace.hpp
Normal file
@ -0,0 +1,265 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGETSPACE_HPP
|
||||
#define KOKKOS_OPENMPTARGETSPACE_HPP
|
||||
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <iosfwd>
|
||||
#include <typeinfo>
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <omp.h>
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/// \brief Initialize lock array for arbitrary size atomics.
|
||||
///
|
||||
/// Arbitrary atomics are implemented using a hash table of locks
|
||||
/// where the hash value is derived from the address of the
|
||||
/// object for which an atomic operation is performed.
|
||||
/// This function initializes the locks to zero (unset).
|
||||
//void init_lock_array_host_space();
|
||||
|
||||
/// \brief Acquire a lock for the address
|
||||
///
|
||||
/// This function tries to acquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully acquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
//bool lock_address_host_space(void* ptr);
|
||||
|
||||
/// \brief Release lock for the address
|
||||
///
|
||||
/// This function releases the lock for the hash value derived
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully acquiring a lock with
|
||||
/// lock_address.
|
||||
//void unlock_address_host_space(void* ptr);
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
/// \class OpenMPTargetSpace
|
||||
/// \brief Memory management for host memory.
|
||||
///
|
||||
/// OpenMPTargetSpace is a memory space that governs host memory. "Host"
|
||||
/// memory means the usual CPU-accessible memory.
|
||||
class OpenMPTargetSpace {
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos memory space
|
||||
typedef OpenMPTargetSpace memory_space ;
|
||||
typedef size_t size_type ;
|
||||
|
||||
/// \typedef execution_space
|
||||
/// \brief Default execution space for this memory space.
|
||||
///
|
||||
/// Every memory space has a default execution space. This is
|
||||
/// useful for things like initializing a View (which happens in
|
||||
/// parallel using the View's default execution space).
|
||||
typedef Kokkos::Experimental::OpenMPTarget execution_space ;
|
||||
|
||||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
/**\brief Default memory space instance */
|
||||
OpenMPTargetSpace();
|
||||
OpenMPTargetSpace( OpenMPTargetSpace && rhs ) = default ;
|
||||
OpenMPTargetSpace( const OpenMPTargetSpace & rhs ) = default ;
|
||||
OpenMPTargetSpace & operator = ( OpenMPTargetSpace && ) = default ;
|
||||
OpenMPTargetSpace & operator = ( const OpenMPTargetSpace & ) = default ;
|
||||
~OpenMPTargetSpace() = default ;
|
||||
|
||||
/**\brief Allocate untracked memory in the space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Deallocate untracked memory in the space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
private:
|
||||
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > ;
|
||||
};
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >
|
||||
: public SharedAllocationRecord< void , void >
|
||||
{
|
||||
private:
|
||||
|
||||
friend Kokkos::Experimental::OpenMPTargetSpace ;
|
||||
|
||||
typedef SharedAllocationRecord< void , void > RecordBase ;
|
||||
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
|
||||
|
||||
static void deallocate( RecordBase * );
|
||||
|
||||
/**\brief Root record for tracked allocations from this OpenMPTargetSpace instance */
|
||||
static RecordBase s_root_record ;
|
||||
|
||||
const Kokkos::Experimental::OpenMPTargetSpace m_space ;
|
||||
|
||||
protected:
|
||||
|
||||
~SharedAllocationRecord();
|
||||
SharedAllocationRecord() = default ;
|
||||
|
||||
SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
, const RecordBase::function_type arg_dealloc = & deallocate
|
||||
);
|
||||
|
||||
public:
|
||||
|
||||
std::string get_label() const;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
SharedAllocationRecord * allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
);
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Reallocate tracked memory in the space */
|
||||
static
|
||||
void * reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Deallocate tracked memory in the space */
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
static void print_records( std::ostream & , const Kokkos::Experimental::OpenMPTargetSpace & , bool detail = false );
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//TODO: implement all possible deep_copies
|
||||
// DeepCopy specialization for the pair (OpenMPTargetSpace, OpenMPTargetSpace).
// Both constructors forward to omp_target_memcpy with n as the length, zero
// destination/source offsets, and omp_get_default_device() as both the
// destination and the source device number.
// NOTE(review): the return value of omp_target_memcpy is ignored, so a failed
// copy is not reported to the caller.
template<class ExecutionSpace>
|
||||
struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
|
||||
// Copy n (length argument) from src to dst without touching any execution space instance.
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
// The const qualifier on src is cast away before the pointer is handed to the OpenMP runtime call.
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
|
||||
}
|
||||
// Same copy as above, but exec.fence() is called first.
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_default_device());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// DeepCopy specialization for the pair (OpenMPTargetSpace, HostSpace).
// Both constructors forward to omp_target_memcpy with n as the length, zero
// offsets, omp_get_default_device() as the destination device number and
// omp_get_initial_device() as the source device number.
// NOTE(review): the return value of omp_target_memcpy is ignored, so a failed
// copy is not reported to the caller.
template<class ExecutionSpace>
|
||||
struct DeepCopy<Kokkos::Experimental::OpenMPTargetSpace,HostSpace,ExecutionSpace> {
|
||||
// Copy n (length argument) from src to dst without touching any execution space instance.
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
// The const qualifier on src is cast away before the pointer is handed to the OpenMP runtime call.
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
|
||||
}
|
||||
// Same copy as above, but exec.fence() is called first.
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_default_device(), omp_get_initial_device());
|
||||
}
|
||||
};
|
||||
|
||||
// DeepCopy specialization for the pair (HostSpace, OpenMPTargetSpace).
// Both constructors forward to omp_target_memcpy with n as the length, zero
// offsets, omp_get_initial_device() as the destination device number and
// omp_get_default_device() as the source device number.
// NOTE(review): the return value of omp_target_memcpy is ignored, so a failed
// copy is not reported to the caller.
template<class ExecutionSpace>
|
||||
struct DeepCopy<HostSpace,Kokkos::Experimental::OpenMPTargetSpace,ExecutionSpace> {
|
||||
// Copy n (length argument) from src to dst without touching any execution space instance.
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
// The const qualifier on src is cast away before the pointer is handed to the OpenMP runtime call.
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
|
||||
}
|
||||
// Same copy as above, but exec.fence() is called first.
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
omp_target_memcpy( dst , const_cast<void*> (src) , n, 0, 0, omp_get_initial_device(), omp_get_default_device());
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
// Full specialization for the pair (Kokkos::HostSpace, OpenMPTargetSpace):
// 'value' is false and both verify() overloads have empty bodies.
// NOTE(review): presumably the first argument is the space code is executing
// in and the second is the space being accessed -- confirm against the
// primary template.
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::OpenMPTargetSpace >
|
||||
{
|
||||
enum { value = false };
|
||||
// No-op: performs no check and reports nothing.
inline static void verify( void ) { }
|
||||
// No-op: the pointer argument is ignored.
inline static void verify( const void * ) { }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -44,9 +44,10 @@
|
||||
#ifndef KOKKOS_QTHREADS_HPP
|
||||
#define KOKKOS_QTHREADS_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#ifdef KOKKOS_ENABLE_QTHREADS
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
// Defines to enable experimental Qthreads functionality.
|
||||
#define QTHREAD_LOCAL_PRIORITY
|
||||
@ -150,6 +151,8 @@ public:
|
||||
|
||||
int shepherd_size() const;
|
||||
int shepherd_worker_size() const;
|
||||
|
||||
static const char* name();
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
@ -194,5 +197,5 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
|
||||
|
||||
#endif // #define KOKKOS_ENABLE_QTHREADS
|
||||
|
||||
#endif // #define KOKKOS_QTHREADS_HPP
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,9 +44,9 @@
|
||||
#ifndef KOKKOS_SCRATCHSPACE_HPP
|
||||
#define KOKKOS_SCRATCHSPACE_HPP
|
||||
|
||||
#include <stdio.h>
|
||||
#include <cstdio>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <Kokkos_Concepts.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
@ -57,7 +57,7 @@ namespace Kokkos {
|
||||
*/
|
||||
template< class ExecSpace >
|
||||
class ScratchMemorySpace {
|
||||
static_assert (Impl::is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
|
||||
static_assert (is_execution_space<ExecSpace>::value,"Instantiating ScratchMemorySpace on non-execution-space type.");
|
||||
public:
|
||||
|
||||
// Alignment of memory chunks returned by 'get'
|
||||
@ -161,6 +161,3 @@ public:
|
||||
|
||||
#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -47,6 +47,9 @@
|
||||
#ifndef KOKKOS_SERIAL_HPP
|
||||
#define KOKKOS_SERIAL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_SERIAL )
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
@ -63,7 +66,6 @@
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_SERIAL )
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
@ -163,6 +165,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
|
||||
static const char* name();
|
||||
//--------------------------------------------------------------------------
|
||||
};
|
||||
|
||||
@ -515,7 +518,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -798,7 +801,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_shared( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
@ -821,5 +824,3 @@ public:
|
||||
#endif // defined( KOKKOS_ENABLE_SERIAL )
|
||||
#endif /* #define KOKKOS_SERIAL_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -46,25 +46,10 @@
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
// If compiling with CUDA then must be using CUDA 8 or better
|
||||
// and use relocatable device code to enable the task policy.
|
||||
// nvcc relocatable device code option: --relocatable-device-code=true
|
||||
|
||||
#if ( defined( KOKKOS_ENABLE_CUDA ) )
|
||||
#if ( 8000 <= CUDA_VERSION ) && \
|
||||
defined( KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE )
|
||||
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
|
||||
#endif
|
||||
#else
|
||||
#define KOKKOS_ENABLE_TASKDAG
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_MemoryPool.hpp>
|
||||
@ -371,7 +356,7 @@ struct TaskPolicyData
|
||||
TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicyData( DepFutureType && arg_future
|
||||
TaskPolicyData( DepFutureType const & arg_future
|
||||
, Kokkos::TaskPriority const & arg_priority )
|
||||
: m_scheduler( 0 )
|
||||
, m_dependence( arg_future )
|
||||
@ -385,6 +370,15 @@ struct TaskPolicyData
|
||||
, m_dependence()
|
||||
, m_priority( static_cast<int>( arg_priority ) )
|
||||
{}
|
||||
|
||||
// Construct policy data from an explicit scheduler together with a
// dependence future and a priority.
//   arg_scheduler : its address (not a copy) is stored in m_scheduler.
//   arg_future    : copied into m_dependence.
//   arg_priority  : stored in m_priority after a static_cast to int.
// NOTE(review): because only the address of arg_scheduler is kept, the
// caller presumably has to keep that scheduler alive while this object is
// in use -- confirm.
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicyData( scheduler_type const & arg_scheduler
|
||||
, DepFutureType const & arg_future
|
||||
, Kokkos::TaskPriority const & arg_priority )
|
||||
: m_scheduler( & arg_scheduler )
|
||||
, m_dependence( arg_future )
|
||||
, m_priority( static_cast<int>( arg_priority ) )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -413,6 +407,7 @@ public:
|
||||
|
||||
using execution_space = ExecSpace ;
|
||||
using memory_space = typename queue_type::memory_space ;
|
||||
using memory_pool = typename queue_type::memory_pool ;
|
||||
using member_type =
|
||||
typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
|
||||
|
||||
@ -431,9 +426,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskScheduler & operator = ( TaskScheduler const & rhs ) = default ;
|
||||
|
||||
TaskScheduler( memory_space const & arg_memory_space
|
||||
, unsigned const arg_memory_pool_capacity
|
||||
, unsigned const arg_memory_pool_log2_superblock = 12 )
|
||||
TaskScheduler( memory_pool const & arg_memory_pool )
|
||||
: m_track()
|
||||
, m_queue(0)
|
||||
{
|
||||
@ -442,21 +435,37 @@ public:
|
||||
record_type ;
|
||||
|
||||
record_type * record =
|
||||
record_type::allocate( arg_memory_space
|
||||
record_type::allocate( memory_space()
|
||||
, "TaskQueue"
|
||||
, sizeof(queue_type)
|
||||
);
|
||||
|
||||
m_queue = new( record->data() )
|
||||
queue_type( arg_memory_space
|
||||
, arg_memory_pool_capacity
|
||||
, arg_memory_pool_log2_superblock );
|
||||
m_queue = new( record->data() ) queue_type( arg_memory_pool );
|
||||
|
||||
record->m_destroy.m_queue = m_queue ;
|
||||
|
||||
m_track.assign_allocated_record_to_uninitialized( record );
|
||||
}
|
||||
|
||||
// Convenience constructor: builds a temporary memory_pool from the given
// memory space and pool parameters and delegates to the
// TaskScheduler( memory_pool const & ) constructor.
// All five arguments are forwarded to the memory_pool constructor in the
// order they are declared here.
// NOTE(review): the trailing "= ..." comments on the block-size parameters
// look like suggested values; they are not defaults, so every argument must
// be supplied by the caller.
TaskScheduler( memory_space const & arg_memory_space
|
||||
, size_t const mempool_capacity
|
||||
, unsigned const mempool_min_block_size // = 1u << 6
|
||||
, unsigned const mempool_max_block_size // = 1u << 10
|
||||
, unsigned const mempool_superblock_size // = 1u << 12
|
||||
)
|
||||
: TaskScheduler( memory_pool( arg_memory_space
|
||||
, mempool_capacity
|
||||
, mempool_min_block_size
|
||||
, mempool_max_block_size
|
||||
, mempool_superblock_size ) )
|
||||
{}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
// Return a pointer to the memory pool owned by the scheduler's queue, or a
// null pointer when this scheduler has no queue (m_queue is null).
// m_memory is accessed with '.' elsewhere in this class
// (allocation_capacity), i.e. it is an object rather than a pointer, so its
// address must be taken to produce the memory_pool* return value.
KOKKOS_INLINE_FUNCTION
|
||||
memory_pool * memory() const noexcept
|
||||
{ return m_queue ? &( m_queue->m_memory ) : (memory_pool*) 0 ; }
|
||||
|
||||
//----------------------------------------
|
||||
/**\brief Allocation size for a spawned task */
|
||||
template< typename FunctorType >
|
||||
@ -502,7 +511,12 @@ public:
|
||||
: (queue_type*) 0 );
|
||||
|
||||
if ( 0 == queue ) {
|
||||
Kokkos::abort("Kokkos spawn given null Future" );
|
||||
Kokkos::abort("Kokkos spawn requires scheduler or non-null Future");
|
||||
}
|
||||
|
||||
if ( arg_policy.m_dependence.m_task != 0 &&
|
||||
arg_policy.m_dependence.m_task->m_queue != queue ) {
|
||||
Kokkos::abort("Kokkos spawn given incompatible scheduler and Future");
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
@ -641,7 +655,7 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocation_capacity() const noexcept
|
||||
{ return m_queue->m_memory.get_mem_size(); }
|
||||
{ return m_queue->m_memory.capacity(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocated_task_count() const noexcept
|
||||
@ -696,6 +710,22 @@ TaskTeam( T const & arg
|
||||
>( arg , arg_priority );
|
||||
}
|
||||
|
||||
// Construct a TaskTeam execution policy from an explicit scheduler plus a
// dependence future.
//   arg_scheduler : scheduler forwarded to the TaskPolicyData constructor.
//   arg_future    : future forwarded as the dependence.
//   arg_priority  : task priority; defaults to TaskPriority::Regular.
// The type of arg_priority is spelled through std::enable_if on
// Kokkos::is_future<F>, so this overload only participates when F is a
// future type.
// Returns a TaskPolicyData< TaskBase<void,void,void>::TaskTeam , F > built
// from the three arguments.
template< typename E , typename F >
|
||||
Kokkos::Impl::
|
||||
TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskTeam( TaskScheduler<E> const & arg_scheduler
|
||||
, F const & arg_future
|
||||
, typename std::enable_if< Kokkos::is_future<F>::value ,
|
||||
TaskPriority >::type const & arg_priority = TaskPriority::Regular
|
||||
)
|
||||
{
|
||||
return
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam , F >
|
||||
( arg_scheduler , arg_future , arg_priority );
|
||||
}
|
||||
|
||||
// Construct a TaskSingle execution policy
|
||||
|
||||
template< typename T >
|
||||
@ -721,6 +751,22 @@ TaskSingle( T const & arg
|
||||
>( arg , arg_priority );
|
||||
}
|
||||
|
||||
// Construct a TaskSingle execution policy from an explicit scheduler plus a
// dependence future.
//   arg_scheduler : scheduler forwarded to the TaskPolicyData constructor.
//   arg_future    : future forwarded as the dependence.
//   arg_priority  : task priority; defaults to TaskPriority::Regular.
// The type of arg_priority is spelled through std::enable_if on
// Kokkos::is_future<F>, so this overload only participates when F is a
// future type.
// Returns a TaskPolicyData< TaskBase<void,void,void>::TaskSingle , F > built
// from the three arguments.
template< typename E , typename F >
|
||||
Kokkos::Impl::
|
||||
TaskPolicyData< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskSingle( TaskScheduler<E> const & arg_scheduler
|
||||
, F const & arg_future
|
||||
, typename std::enable_if< Kokkos::is_future<F>::value ,
|
||||
TaskPriority >::type const & arg_priority = TaskPriority::Regular
|
||||
)
|
||||
{
|
||||
return
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle , F >
|
||||
( arg_scheduler , arg_future , arg_priority );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/**\brief A host control thread spawns a task with options
|
||||
@ -849,3 +895,4 @@ void wait( TaskScheduler< ExecSpace > const & scheduler )
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #ifndef KOKKOS_TASKSCHEDULER_HPP */
|
||||
|
||||
|
||||
@ -44,9 +44,10 @@
|
||||
#ifndef KOKKOS_THREADS_HPP
|
||||
#define KOKKOS_THREADS_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
@ -178,6 +179,7 @@ public:
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
|
||||
|
||||
static const char* name();
|
||||
//@}
|
||||
//----------------------------------------
|
||||
};
|
||||
@ -190,7 +192,7 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess
|
||||
struct MemorySpaceAccess
|
||||
< Kokkos::Threads::memory_space
|
||||
, Kokkos::Threads::scratch_memory_space
|
||||
>
|
||||
@ -227,6 +229,6 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
|
||||
#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
|
||||
#endif /* #define KOKKOS_THREADS_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,7 +44,7 @@
|
||||
#ifndef KOKKOS_TIMER_HPP
|
||||
#define KOKKOS_TIMER_HPP
|
||||
|
||||
#include <stddef.h>
|
||||
#include <cstddef>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#undef KOKKOS_ENABLE_LIBRT
|
||||
@ -110,3 +110,4 @@ public:
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #ifndef KOKKOS_TIMER_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -51,3 +51,4 @@
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -625,7 +625,7 @@ private:
|
||||
|
||||
#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( ARG ) \
|
||||
View::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check(); \
|
||||
Kokkos::Impl::view_verify_operator_bounds ARG ;
|
||||
Kokkos::Impl::view_verify_operator_bounds< typename traits::memory_space > ARG ;
|
||||
|
||||
#else
|
||||
|
||||
@ -646,12 +646,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator()( Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,args...) )
|
||||
return m_map.reference();
|
||||
}
|
||||
|
||||
@ -669,12 +664,7 @@ public:
|
||||
operator()( const I0 & i0
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
|
||||
return m_map.reference(i0);
|
||||
}
|
||||
|
||||
@ -690,13 +680,7 @@ public:
|
||||
operator()( const I0 & i0
|
||||
, Args ... args ) const
|
||||
{
|
||||
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
|
||||
return m_map.m_handle[ i0 ];
|
||||
}
|
||||
|
||||
@ -712,12 +696,7 @@ public:
|
||||
operator()( const I0 & i0
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
|
||||
}
|
||||
|
||||
@ -733,12 +712,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator[]( const I0 & i0 ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
|
||||
return m_map.reference(i0);
|
||||
}
|
||||
|
||||
@ -752,12 +726,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator[]( const I0 & i0 ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
|
||||
return m_map.m_handle[ i0 ];
|
||||
}
|
||||
|
||||
@ -771,12 +740,7 @@ public:
|
||||
), reference_type >::type
|
||||
operator[]( const I0 & i0 ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0) )
|
||||
return m_map.m_handle[ m_map.m_offset.m_stride.S0 * i0 ];
|
||||
}
|
||||
|
||||
@ -794,12 +758,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.reference(i0,i1);
|
||||
}
|
||||
|
||||
@ -815,12 +774,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i0 + m_map.m_offset.m_dim.N0 * i1 ];
|
||||
}
|
||||
|
||||
@ -836,12 +790,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i0 + m_map.m_offset.m_stride * i1 ];
|
||||
}
|
||||
|
||||
@ -857,12 +806,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i1 + m_map.m_offset.m_dim.N1 * i0 ];
|
||||
}
|
||||
|
||||
@ -878,12 +822,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i1 + m_map.m_offset.m_stride * i0 ];
|
||||
}
|
||||
|
||||
@ -899,12 +838,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,args...) )
|
||||
return m_map.m_handle[ i0 * m_map.m_offset.m_stride.S0 +
|
||||
i1 * m_map.m_offset.m_stride.S1 ];
|
||||
}
|
||||
@ -923,12 +857,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2) ];
|
||||
}
|
||||
|
||||
@ -943,12 +872,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,args...) )
|
||||
return m_map.reference(i0,i1,i2);
|
||||
}
|
||||
|
||||
@ -966,12 +890,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3) ];
|
||||
}
|
||||
|
||||
@ -986,12 +905,7 @@ public:
|
||||
operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3);
|
||||
}
|
||||
|
||||
@ -1011,12 +925,7 @@ public:
|
||||
, const I4 & i4
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4) ];
|
||||
}
|
||||
|
||||
@ -1033,12 +942,7 @@ public:
|
||||
, const I4 & i4
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4);
|
||||
}
|
||||
|
||||
@ -1058,12 +962,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5) ];
|
||||
}
|
||||
|
||||
@ -1080,12 +979,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5);
|
||||
}
|
||||
|
||||
@ -1105,12 +999,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6) ];
|
||||
}
|
||||
|
||||
@ -1127,12 +1016,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
|
||||
}
|
||||
|
||||
@ -1152,12 +1036,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
return m_map.m_handle[ m_map.m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ];
|
||||
}
|
||||
|
||||
@ -1174,12 +1053,7 @@ public:
|
||||
, const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7
|
||||
, Args ... args ) const
|
||||
{
|
||||
#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (NULL,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#else
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track.template get_label<typename traits::memory_space>().c_str(),m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
#endif
|
||||
|
||||
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (m_track,m_map,i0,i1,i2,i3,i4,i5,i6,i7,args...) )
|
||||
return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
|
||||
}
|
||||
|
||||
|
||||
@ -9,14 +9,14 @@ default: messages build-lib
|
||||
echo "End Build"
|
||||
|
||||
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
|
||||
CXX = $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper
|
||||
else
|
||||
CXX = g++
|
||||
endif
|
||||
|
||||
CXXFLAGS = -O3
|
||||
LINK ?= $(CXX)
|
||||
LDFLAGS ?=
|
||||
LDFLAGS ?=
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
@ -66,7 +66,7 @@ else
|
||||
KOKKOS_DEBUG_CMAKE = ON
|
||||
endif
|
||||
|
||||
messages:
|
||||
messages:
|
||||
echo "Start Build"
|
||||
|
||||
build-makefile-kokkos:
|
||||
@ -80,13 +80,13 @@ build-makefile-kokkos:
|
||||
echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
|
||||
echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
|
||||
echo "CXX ?= $(CXX)" >> Makefile.kokkos
|
||||
echo "CXX ?= $(CXX)" >> Makefile.kokkos
|
||||
echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
|
||||
echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "#Variables used in application Makefiles" >> Makefile.kokkos
|
||||
echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
|
||||
@ -133,11 +133,11 @@ build-cmake-kokkos:
|
||||
echo "else()" >> kokkos.cmake
|
||||
echo ' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
|
||||
echo "endif()" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
|
||||
echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "#Variables used in application Makefiles" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
@ -164,7 +164,7 @@ build-cmake-kokkos:
|
||||
|
||||
build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
|
||||
|
||||
mkdir:
|
||||
mkdir:
|
||||
mkdir -p $(PREFIX)
|
||||
mkdir -p $(PREFIX)/bin
|
||||
mkdir -p $(PREFIX)/include
|
||||
@ -187,7 +187,7 @@ copy-openmp: mkdir
|
||||
mkdir -p $(PREFIX)/include/OpenMP
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
|
||||
|
||||
install: mkdir $(CONDITIONAL_COPIES) build-lib
|
||||
install: mkdir $(CONDITIONAL_COPIES) build-lib
|
||||
cp $(COPY_FLAG) $(NVCC_WRAPPER) $(PREFIX)/bin
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
|
||||
|
||||
@ -41,7 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <cstdio>
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
@ -51,7 +54,6 @@
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_OPENMP
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
@ -82,13 +84,13 @@ bool s_using_hwloc = false;
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
int OpenMPExec::m_map_rank[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
|
||||
int OpenMPExec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
HostThreadTeamData * OpenMPExec::m_pool[ OpenMPExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
void OpenMPexec::verify_is_process( const char * const label )
|
||||
void OpenMPExec::verify_is_process( const char * const label )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
std::string msg( label );
|
||||
@ -97,7 +99,7 @@ void OpenMPexec::verify_is_process( const char * const label )
|
||||
}
|
||||
}
|
||||
|
||||
void OpenMPexec::verify_initialized( const char * const label )
|
||||
void OpenMPExec::verify_initialized( const char * const label )
|
||||
{
|
||||
if ( 0 == m_pool[0] ) {
|
||||
std::string msg( label );
|
||||
@ -122,7 +124,7 @@ void OpenMPexec::verify_initialized( const char * const label )
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
void OpenMPexec::clear_thread_data()
|
||||
void OpenMPExec::clear_thread_data()
|
||||
{
|
||||
const size_t member_bytes =
|
||||
sizeof(int64_t) *
|
||||
@ -149,7 +151,7 @@ void OpenMPexec::clear_thread_data()
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
|
||||
void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
|
||||
void OpenMPExec::resize_thread_data( size_t pool_reduce_bytes
|
||||
, size_t team_reduce_bytes
|
||||
, size_t team_shared_bytes
|
||||
, size_t thread_local_bytes )
|
||||
@ -231,7 +233,7 @@ namespace Kokkos {
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
int OpenMP::is_initialized()
|
||||
{ return 0 != Impl::OpenMPexec::m_pool[0]; }
|
||||
{ return 0 != Impl::OpenMPExec::m_pool[0]; }
|
||||
|
||||
void OpenMP::initialize( unsigned thread_count ,
|
||||
unsigned use_numa_count ,
|
||||
@ -257,7 +259,7 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
|
||||
static int omp_max_threads = nthreads;
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
|
||||
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
|
||||
|
||||
bool thread_spawn_failed = false ;
|
||||
|
||||
@ -270,7 +272,7 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
|
||||
( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
|
||||
|
||||
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
|
||||
std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPExec::MAX_THREAD_COUNT ];
|
||||
|
||||
// If hwloc available then use it's maximum value.
|
||||
|
||||
@ -316,16 +318,16 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
|
||||
: omp_rank ;
|
||||
|
||||
Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
|
||||
Impl::OpenMPExec::m_map_rank[ omp_rank ] = thread_r ;
|
||||
}
|
||||
/* END #pragma omp critical */
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
if ( ! thread_spawn_failed ) {
|
||||
Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
|
||||
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
|
||||
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
|
||||
Impl::OpenMPExec::m_pool_topo[0] = thread_count ;
|
||||
Impl::OpenMPExec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
|
||||
Impl::OpenMPExec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
|
||||
|
||||
// New, unified host thread team data:
|
||||
{
|
||||
@ -334,7 +336,7 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
size_t team_shared_bytes = 1024 * thread_count ;
|
||||
size_t thread_local_bytes = 1024 ;
|
||||
|
||||
Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
Impl::OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes
|
||||
@ -371,15 +373,15 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
|
||||
void OpenMP::finalize()
|
||||
{
|
||||
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
|
||||
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
|
||||
Impl::OpenMPExec::verify_initialized( "OpenMP::finalize" );
|
||||
Impl::OpenMPExec::verify_is_process( "OpenMP::finalize" );
|
||||
|
||||
// New, unified host thread team data:
|
||||
Impl::OpenMPexec::clear_thread_data();
|
||||
Impl::OpenMPExec::clear_thread_data();
|
||||
|
||||
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
|
||||
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
|
||||
Impl::OpenMPexec::m_pool_topo[2] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[0] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[1] = 0 ;
|
||||
Impl::OpenMPExec::m_pool_topo[2] = 0 ;
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
@ -396,7 +398,7 @@ void OpenMP::finalize()
|
||||
|
||||
void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
{
|
||||
Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
|
||||
Impl::OpenMPExec::verify_is_process( "OpenMP::print_configuration" );
|
||||
|
||||
s << "Kokkos::OpenMP" ;
|
||||
|
||||
@ -414,12 +416,12 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
;
|
||||
#endif
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
|
||||
const bool is_initialized = 0 != Impl::OpenMPExec::m_pool[0] ;
|
||||
|
||||
if ( is_initialized ) {
|
||||
const int numa_count = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
|
||||
const int core_per_numa = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
|
||||
const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
|
||||
const int numa_count = Kokkos::Impl::OpenMPExec::m_pool_topo[0] / Kokkos::Impl::OpenMPExec::m_pool_topo[1] ;
|
||||
const int core_per_numa = Kokkos::Impl::OpenMPExec::m_pool_topo[1] / Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
|
||||
const int thread_per_core = Kokkos::Impl::OpenMPExec::m_pool_topo[2] ;
|
||||
|
||||
s << " thread_pool_topology[ " << numa_count
|
||||
<< " x " << core_per_numa
|
||||
@ -428,7 +430,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
<< std::endl ;
|
||||
|
||||
if ( detail ) {
|
||||
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
|
||||
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPExec::m_pool_topo[0] );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
@ -442,7 +444,7 @@ void OpenMP::print_configuration( std::ostream & s , const bool detail )
|
||||
|
||||
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
|
||||
s << " thread omp_rank[" << i << "]"
|
||||
<< " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
|
||||
<< " kokkos_rank[" << Impl::OpenMPExec::m_map_rank[ i ] << "]"
|
||||
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
|
||||
<< std::endl ;
|
||||
}
|
||||
@ -457,6 +459,11 @@ int OpenMP::concurrency() {
|
||||
return thread_pool_size(0);
|
||||
}
|
||||
|
||||
const char* OpenMP::name() { return "OpenMP"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_OPENMP_EXEC_PREVENT_LINK_ERROR() {}
|
||||
#endif //KOKKOS_ENABLE_OPENMP
|
||||
|
||||
@ -44,6 +44,9 @@
|
||||
#ifndef KOKKOS_OPENMPEXEC_HPP
|
||||
#define KOKKOS_OPENMPEXEC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
@ -66,7 +69,7 @@ namespace Impl {
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Data for OpenMP thread execution */
|
||||
|
||||
class OpenMPexec {
|
||||
class OpenMPExec {
|
||||
public:
|
||||
|
||||
friend class Kokkos::OpenMP ;
|
||||
@ -324,17 +327,21 @@ public:
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
inline
|
||||
bool OpenMP::in_parallel()
|
||||
{ return omp_in_parallel(); }
|
||||
|
||||
inline
|
||||
int OpenMP::thread_pool_size( int depth )
|
||||
{
|
||||
return Impl::OpenMPexec::pool_size(depth);
|
||||
return Impl::OpenMPExec::pool_size(depth);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int OpenMP::thread_pool_rank()
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
|
||||
return Impl::OpenMPExec::m_map_rank[ omp_get_thread_num() ];
|
||||
#else
|
||||
return -1 ;
|
||||
#endif
|
||||
@ -342,4 +349,6 @@ int OpenMP::thread_pool_rank()
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,9 +44,12 @@
|
||||
#ifndef KOKKOS_OPENMP_PARALLEL_HPP
|
||||
#define KOKKOS_OPENMP_PARALLEL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
|
||||
#include <omp.h>
|
||||
#include <iostream>
|
||||
#include <OpenMP/Kokkos_OpenMPexec.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Exec.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -58,7 +61,7 @@ namespace Impl {
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::OpenMP
|
||||
, Kokkos::OpenMP
|
||||
>
|
||||
{
|
||||
private:
|
||||
@ -77,7 +80,7 @@ private:
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -94,7 +97,7 @@ private:
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -111,12 +114,12 @@ public:
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
@ -200,11 +203,6 @@ private:
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update );
|
||||
}
|
||||
@ -218,11 +216,6 @@ private:
|
||||
, reference_type update )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update );
|
||||
}
|
||||
@ -235,13 +228,13 @@ public:
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t pool_reduce_bytes =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
@ -249,7 +242,7 @@ public:
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
@ -282,12 +275,12 @@ public:
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
|
||||
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
|
||||
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
@ -305,7 +298,7 @@ public:
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ViewType & arg_result_view
|
||||
, const ViewType & arg_view
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
@ -313,7 +306,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
, m_result_ptr( arg_view.data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -327,7 +320,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -378,11 +371,6 @@ private:
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update , final );
|
||||
}
|
||||
@ -396,11 +384,6 @@ private:
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update , final );
|
||||
}
|
||||
@ -411,13 +394,13 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
|
||||
const int value_count = Analysis::value_count( m_functor );
|
||||
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
OpenMPExec::resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
@ -425,7 +408,7 @@ public:
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
|
||||
|
||||
@ -565,22 +548,22 @@ public:
|
||||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const size_t pool_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
|
||||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_size
|
||||
OpenMPExec::resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
@ -723,8 +706,8 @@ public:
|
||||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPExec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t pool_reduce_size =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
@ -733,14 +716,14 @@ public:
|
||||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_size
|
||||
OpenMPExec::resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
HostThreadTeamData & data = *OpenMPExec::get_thread_data();
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
@ -785,12 +768,12 @@ public:
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
|
||||
const pointer_type ptr = pointer_type( OpenMPExec::get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
for ( int i = 1 ; i < OpenMPExec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
|
||||
, OpenMPExec::get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
@ -830,7 +813,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >
|
||||
@ -849,5 +832,6 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,15 +36,16 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
|
||||
@ -110,21 +111,27 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
|
||||
static task_root_type * const end =
|
||||
(task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
|
||||
HostThreadTeamData & team_data_single =
|
||||
HostThreadTeamDataSingleton::singleton();
|
||||
|
||||
const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
|
||||
// const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
|
||||
const int team_size = Impl::OpenMPExec::pool_size(2); // Threads per core
|
||||
// const int team_size = Impl::OpenMPExec::pool_size(1); // Threads per NUMA
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
OpenMPExec::resize_thread_data( 0 /* global reduce buffer */
|
||||
, 512 * team_size /* team reduce buffer */
|
||||
, 0 /* team shared buffer */
|
||||
, 0 /* thread local buffer */
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
|
||||
Impl::HostThreadTeamData & self = *Impl::OpenMPExec::get_thread_data();
|
||||
|
||||
// Organizing threads into a team performs a barrier across the
|
||||
// entire pool to insure proper initialization of the team
|
||||
@ -164,7 +171,7 @@ fflush(stdout);
|
||||
if ( 0 != task && end != task ) {
|
||||
// team member #0 completes the previously executed task,
|
||||
// completion may delete the task
|
||||
queue->complete( task );
|
||||
queue->complete( task );
|
||||
}
|
||||
|
||||
// If 0 == m_ready_count then set task = 0
|
||||
@ -301,7 +308,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
|
||||
(*task->m_apply)( task , & single_exec );
|
||||
|
||||
queue->complete( task );
|
||||
queue->complete( task );
|
||||
|
||||
} while(1);
|
||||
}
|
||||
@ -310,7 +317,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_OPENMP_KOKKOS_OPENMP_TASK_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,6 +44,7 @@
|
||||
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
#define KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
306
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
Normal file
306
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
Normal file
@ -0,0 +1,306 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <omp.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
|
||||
#include <Kokkos_OpenMPTargetSpace.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
/* Default allocation mechanism */
|
||||
OpenMPTargetSpace::OpenMPTargetSpace()
|
||||
{}
|
||||
|
||||
void * OpenMPTargetSpace::allocate( const size_t arg_alloc_size ) const
|
||||
{
|
||||
static_assert( sizeof(void*) == sizeof(uintptr_t)
|
||||
, "Error sizeof(void*) != sizeof(uintptr_t)" );
|
||||
|
||||
void * ptr;
|
||||
|
||||
ptr = omp_target_alloc( arg_alloc_size, omp_get_default_device());
|
||||
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
||||
void OpenMPTargetSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
|
||||
{
|
||||
if ( arg_alloc_ptr ) {
|
||||
|
||||
omp_target_free( arg_alloc_ptr , omp_get_default_device() );
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
// Definition of the static root record of the OpenMPTargetSpace specialization.
// Its address is passed to the base-class constructor and to
// print_host_accessible_records() elsewhere in this file.
SharedAllocationRecord< void , void >
|
||||
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record ;
|
||||
|
||||
/** \brief Return the tracked block (header + user memory) to the memory space. */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
~SharedAllocationRecord()
{
  typedef SharedAllocationRecord< void , void > base_type ;

  m_space.deallocate( base_type::m_alloc_ptr , base_type::m_alloc_size );
}
|
||||
|
||||
//TODO: Implement deep copy back see CudaSpace
|
||||
std::string
|
||||
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_label() const {
|
||||
return std::string("OpenMPTargetAllocation");
|
||||
}
|
||||
|
||||
/** \brief Create a new heap-allocated record for arg_alloc_size bytes in arg_space.
 *
 *  The returned record is released with deallocate().
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >*
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
allocate( const Kokkos::Experimental::OpenMPTargetSpace & arg_space,
          const std::string & arg_label ,
          const size_t arg_alloc_size)
{
  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > record_type ;

  record_type * const created = new record_type( arg_space , arg_label , arg_alloc_size );

  return created ;
}
|
||||
|
||||
void
|
||||
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
|
||||
deallocate( SharedAllocationRecord<void, void> * arg_rec )
|
||||
{
|
||||
delete static_cast<SharedAllocationRecord*>(arg_rec);
|
||||
}
|
||||
|
||||
/** \brief Allocate [ SharedAllocationHeader , user_memory ] in arg_space and
 *         register the record with the base class.
 *
 *  \param arg_space       memory space that provides (and later frees) the block
 *  \param arg_label       label for the allocation
 *  \param arg_alloc_size  number of user bytes; the header size is added on top
 *  \param arg_dealloc     deallocation function passed through to the base class
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
SharedAllocationRecord( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                      , const std::string                             & arg_label
                      , const size_t                                    arg_alloc_size
                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
                      )
  // Pass through allocated [ SharedAllocationHeader , user_memory ]
  // Pass through deallocation function
  : SharedAllocationRecord< void , void >
      ( & SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::s_root_record
      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
      , sizeof(SharedAllocationHeader) + arg_alloc_size
      , arg_dealloc
      )
  , m_space( arg_space )
{
  // The header is staged in a local object.
  SharedAllocationHeader header;

  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );

  strncpy( header.m_label
         , arg_label.c_str()
         , SharedAllocationHeader::maximum_label_length
         );

  // strncpy does not write a terminator when the label fills the whole
  // buffer, so force one into the last byte it wrote.
  header.m_label[ SharedAllocationHeader::maximum_label_length - 1 ] = (char) 0 ;

  // TODO DeepCopy: the staged header is not yet copied into the allocated block.
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief Allocate tracked memory and return the pointer to the user data.
 *
 *  A zero-byte request returns a null pointer without creating a record.
 *  Otherwise a record is created and RecordBase::increment() is called on it.
 */
void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
allocate_tracked( const Kokkos::Experimental::OpenMPTargetSpace & arg_space
                , const std::string & arg_alloc_label
                , const size_t arg_alloc_size )
{
  if ( 0 == arg_alloc_size ) {
    return static_cast< void * >( 0 );
  }

  SharedAllocationRecord * const record =
    allocate( arg_space , arg_alloc_label , arg_alloc_size );

  RecordBase::increment( record );

  return record->data();
}
|
||||
|
||||
/** \brief Drop one reference on the record that owns arg_alloc_ptr.
 *
 *  A null pointer is ignored.  get_record() throws if the pointer does not
 *  map to a valid record.
 */
void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
  if ( 0 == arg_alloc_ptr ) { return ; }

  SharedAllocationRecord * const owner = get_record( arg_alloc_ptr );

  RecordBase::decrement( owner );
}
|
||||
|
||||
/** \brief Replace a tracked allocation with a new one of arg_alloc_size bytes.
 *
 *  A new record is allocated in the same space and with the same label as
 *  the old one.  The first min(old size, new size) user bytes are copied
 *  with omp_target_memcpy on the default device (the device used by
 *  OpenMPTargetSpace::allocate), then the reference on the old record is
 *  dropped.
 *
 *  \return pointer to the user data of the new allocation
 *  \throw  runtime exception (via throw_runtime_exception) if the copy fails
 *          or if arg_alloc_ptr is not a tracked allocation
 */
void * SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
                  , const size_t arg_alloc_size )
{
  SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
  SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );

  // Take the reference first so that a failed copy can release the new
  // record through the normal decrement path.
  RecordBase::increment( r_new );

  // NOTE(review): size() is taken from the previously commented-out DeepCopy
  // call; presumably it is the user-visible byte count - confirm.
  const size_t copy_bytes = std::min( r_old->size() , r_new->size() );

  if ( 0 < copy_bytes ) {
    const int device = omp_get_default_device();

    // omp_target_memcpy returns zero on success.
    if ( 0 != omp_target_memcpy( r_new->data() , r_old->data() , copy_bytes
                               , 0 , 0 , device , device ) ) {
      RecordBase::decrement( r_new );
      Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::reallocate_tracked ERROR: omp_target_memcpy failed" ) );
    }
  }

  RecordBase::decrement( r_old );

  return r_new->data();
}
|
||||
|
||||
/** \brief Map a user-data pointer back to the record that tracks it.
 *
 *  The header is located with SharedAllocationHeader::get_header() and its
 *  m_record field is read.  The record is accepted only if its m_alloc_ptr
 *  points back at that header.
 *
 *  \throw runtime exception (via throw_runtime_exception) if alloc_ptr is
 *         null, the header holds no record, or the record does not point
 *         back at the header
 */
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void > *
SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record( void * alloc_ptr )
{
  typedef SharedAllocationHeader  Header ;
  typedef SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >  RecordHost ;

  SharedAllocationHeader const * const head   = alloc_ptr ? Header::get_header( alloc_ptr ) : (SharedAllocationHeader *)0 ;
  RecordHost                   * const record = head ? static_cast< RecordHost * >( head->m_record ) : (RecordHost *) 0 ;

  // Check 'record' for null before it is dereferenced: a header whose
  // m_record is null would otherwise crash here instead of reporting an error.
  if ( ! alloc_ptr || ! record || record->m_alloc_ptr != head ) {
    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::get_record ERROR" ) );
  }

  return record ;
}
|
||||
|
||||
// Iterate records to print orphaned memory ...
|
||||
void SharedAllocationRecord< Kokkos::Experimental::OpenMPTargetSpace , void >::
|
||||
print_records( std::ostream & s , const Kokkos::Experimental::OpenMPTargetSpace & space , bool detail )
|
||||
{
|
||||
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "OpenMPTargetSpace" , & s_root_record , detail );
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class >
|
||||
struct ViewOperatorBoundsErrorAbort ;
|
||||
|
||||
template<>
|
||||
struct ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace > {
|
||||
static void apply( const size_t rank
|
||||
, const size_t n0 , const size_t n1
|
||||
, const size_t n2 , const size_t n3
|
||||
, const size_t n4 , const size_t n5
|
||||
, const size_t n6 , const size_t n7
|
||||
, const size_t i0 , const size_t i1
|
||||
, const size_t i2 , const size_t i3
|
||||
, const size_t i4 , const size_t i5
|
||||
, const size_t i6 , const size_t i7 );
|
||||
};
|
||||
|
||||
void ViewOperatorBoundsErrorAbort< Kokkos::Experimental::OpenMPTargetSpace >::
|
||||
apply( const size_t rank
|
||||
, const size_t n0 , const size_t n1
|
||||
, const size_t n2 , const size_t n3
|
||||
, const size_t n4 , const size_t n5
|
||||
, const size_t n6 , const size_t n7
|
||||
, const size_t i0 , const size_t i1
|
||||
, const size_t i2 , const size_t i3
|
||||
, const size_t i4 , const size_t i5
|
||||
, const size_t i6 , const size_t i7 )
|
||||
{
|
||||
printf( "View operator bounds error : rank(%lu) dim(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu) index(%lu,%lu,%lu,%lu,%lu,%lu,%lu,%lu)"
|
||||
, rank , n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7
|
||||
, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
|
||||
//Kokkos::Impl::throw_runtime_exception( buffer );
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*
|
||||
namespace Kokkos {
|
||||
namespace {
|
||||
const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
|
||||
const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
|
||||
static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
|
||||
}
|
||||
|
||||
namespace Impl {
|
||||
void init_lock_array_host_space() {
|
||||
static int is_initialized = 0;
|
||||
if(! is_initialized)
|
||||
for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
|
||||
HOST_SPACE_ATOMIC_LOCKS[i] = 0;
|
||||
}
|
||||
|
||||
bool lock_address_host_space(void* ptr) {
|
||||
return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
|
||||
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
|
||||
0 , 1);
|
||||
}
|
||||
|
||||
void unlock_address_host_space(void* ptr) {
|
||||
atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
|
||||
(( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
|
||||
0);
|
||||
}
|
||||
|
||||
}
|
||||
}*/
|
||||
273
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Normal file
273
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Normal file
@ -0,0 +1,273 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <limits>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <iostream>
|
||||
#include <impl/Kokkos_CPUDiscovery.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_OPENMPTARGET
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int kokkos_omp_in_parallel();
|
||||
|
||||
int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int kokkos_omp_in_parallel()
|
||||
{
|
||||
#ifndef __CUDA_ARCH__
|
||||
return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool s_using_hwloc = false;
|
||||
|
||||
} // namespace
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
// Definition of the static flag reported by OpenMPTarget::is_initialized().
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
// Starts out false; OpenMPTarget::initialize() sets it and finalize() clears it.
bool OpenMPTarget::m_is_initialized = false;
|
||||
}
|
||||
}
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
|
||||
//int OpenMPTargetExec::m_map_rank[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
//int OpenMPTargetExec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
//OpenMPTargetExec * OpenMPTargetExec::m_pool[ OpenMPTargetExec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
void OpenMPTargetExec::verify_is_process( const char * const label )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: in parallel" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
}
|
||||
|
||||
void OpenMPTargetExec::verify_initialized( const char * const label )
|
||||
{
|
||||
if ( 0 == Kokkos::Experimental::OpenMPTarget::is_initialized() ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: not initialized" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
if ( omp_get_max_threads() != Kokkos::Experimental::OpenMPTarget::thread_pool_size(0) ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: Initialized but threads modified inappropriately" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Definitions of the static scratch-buffer bookkeeping of OpenMPTargetExec:
// no buffer and a size of zero until resize_scratch() allocates one.
void* OpenMPTargetExec::m_scratch_ptr = NULL;
|
||||
int64_t OpenMPTargetExec::m_scratch_size = 0;
|
||||
|
||||
void OpenMPTargetExec::clear_scratch()
|
||||
{
|
||||
Kokkos::Experimental::OpenMPTargetSpace space;
|
||||
space.deallocate(m_scratch_ptr,m_scratch_size);
|
||||
m_scratch_ptr = NULL;
|
||||
m_scratch_size = NULL;
|
||||
}
|
||||
|
||||
/** \brief Current scratch buffer; NULL if none has been allocated or it was cleared. */
void* OpenMPTargetExec::get_scratch_ptr()
{
  return m_scratch_ptr ;
}
|
||||
|
||||
void OpenMPTargetExec::resize_scratch( int64_t reduce_bytes ,
|
||||
int64_t team_reduce_bytes,
|
||||
int64_t team_shared_bytes, int64_t thread_local_bytes)
|
||||
{
|
||||
Kokkos::Experimental::OpenMPTargetSpace space;
|
||||
uint64_t total_size = MAX_ACTIVE_TEAMS * reduce_bytes + // Inter Team Reduction
|
||||
MAX_ACTIVE_TEAMS * team_reduce_bytes + // Intra Team Reduction
|
||||
MAX_ACTIVE_TEAMS * team_shared_bytes + // Team Local Scratch
|
||||
MAX_ACTIVE_THREADS * thread_local_bytes; // Thread Private Scratch
|
||||
|
||||
if( total_size > m_scratch_size ) {
|
||||
space.deallocate(m_scratch_ptr,m_scratch_size);
|
||||
m_scratch_size = total_size;
|
||||
m_scratch_ptr = space.allocate(total_size);
|
||||
}
|
||||
|
||||
}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
int OpenMPTarget::is_initialized()
|
||||
{ return m_is_initialized; }// != Impl::OpenMPTargetExec::m_pool[0]; }
|
||||
|
||||
void OpenMPTarget::initialize( unsigned thread_count ,
|
||||
unsigned use_numa_count ,
|
||||
unsigned use_cores_per_numa )
|
||||
{
|
||||
// Before any other call to OMP query the maximum number of threads
|
||||
// and save the value for re-initialization unit testing.
|
||||
|
||||
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Kokkos::Impl::init_lock_array_host_space();
|
||||
|
||||
#ifdef KOKKOS_ENABLE_PROFILING
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
m_is_initialized = true;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void OpenMPTarget::finalize()
|
||||
{
|
||||
Kokkos::Impl::OpenMPTargetExec::verify_initialized( "OpenMPTarget::finalize" );
|
||||
Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::finalize" );
|
||||
|
||||
m_is_initialized = false;
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
if ( Kokkos::Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
|
||||
hwloc::unbind_this_thread();
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_ENABLE_PROFILING
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void OpenMPTarget::print_configuration( std::ostream & s , const bool detail )
|
||||
{
|
||||
Kokkos::Impl::OpenMPTargetExec::verify_is_process( "OpenMPTarget::print_configuration" );
|
||||
/*
|
||||
s << "Kokkos::Experimental::OpenMPTarget" ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET )
|
||||
s << " KOKKOS_ENABLE_OPENMPTARGET" ;
|
||||
#endif
|
||||
#if defined( KOKKOS_HAVE_HWLOC )
|
||||
|
||||
const unsigned numa_count_ = Kokkos::hwloc::get_available_numa_count();
|
||||
const unsigned cores_per_numa = Kokkos::hwloc::get_available_cores_per_numa();
|
||||
const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
|
||||
|
||||
s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
|
||||
<< " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
|
||||
;
|
||||
#endif
|
||||
|
||||
const bool is_initialized = 0 != Impl::OpenMPTargetExec::m_pool[0] ;
|
||||
|
||||
if ( is_initialized ) {
|
||||
const int numa_count = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] ;
|
||||
const int core_per_numa = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[1] / Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
|
||||
const int thread_per_core = Kokkos::Impl::OpenMPTargetExec::m_pool_topo[2] ;
|
||||
|
||||
s << " thread_pool_topology[ " << numa_count
|
||||
<< " x " << core_per_numa
|
||||
<< " x " << thread_per_core
|
||||
<< " ]"
|
||||
<< std::endl ;
|
||||
|
||||
if ( detail ) {
|
||||
std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPTargetExec::m_pool_topo[0] );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp critical
|
||||
{
|
||||
coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
|
||||
}
|
||||
// END #pragma omp critical
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
|
||||
s << " thread omp_rank[" << i << "]"
|
||||
<< " kokkos_rank[" << Impl::OpenMPTargetExec::m_map_rank[ i ] << "]"
|
||||
<< " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
|
||||
<< std::endl ;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
s << " not initialized" << std::endl ;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
int OpenMPTarget::concurrency() {
|
||||
return thread_pool_size(0);
|
||||
}
|
||||
|
||||
/** \brief Name of this execution space. */
const char* OpenMPTarget::name()
{
  return "OpenMPTarget" ;
}
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif //KOKKOS_ENABLE_OPENMPTARGET
|
||||
727
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
Normal file
727
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp
Normal file
@ -0,0 +1,727 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGETEXEC_HPP
|
||||
#define KOKKOS_OPENMPTARGETEXEC_HPP
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Data for OpenMPTarget thread execution */
|
||||
|
||||
|
||||
class OpenMPTargetExec {
public:

  // Multipliers used by resize_scratch() when sizing the scratch buffer.
  enum { MAX_ACTIVE_THREADS = 256*8*56*4 };
  enum { MAX_ACTIVE_TEAMS   = MAX_ACTIVE_THREADS/32 };

  /** \brief Throw if called inside an OpenMP parallel region; the argument prefixes the message. */
  static void verify_is_process( const char * const );

  /** \brief Throw if the backend is not initialized; the argument prefixes the message. */
  static void verify_initialized( const char * const );

  /** \brief Current scratch buffer (m_scratch_ptr). */
  static void * get_scratch_ptr();

  /** \brief Free the scratch buffer and reset pointer and size. */
  static void clear_scratch();

  /** \brief Enlarge the scratch buffer if the requested per-team and
   *         per-thread byte counts need more than is currently allocated. */
  static void resize_scratch( int64_t reduce_bytes , int64_t team_reduce_bytes, int64_t team_shared_bytes, int64_t thread_local_bytes );

  // Scratch buffer and its size in bytes.
  static void *  m_scratch_ptr;
  static int64_t m_scratch_size;

private:

  static void * scratch_ptr;
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
class OpenMPTargetExecTeamMember {
|
||||
public:
|
||||
|
||||
enum { TEAM_REDUCE_SIZE = 512 };
|
||||
|
||||
/** \brief Thread states for team synchronization */
|
||||
enum { Active = 0 , Rendezvous = 1 };
|
||||
|
||||
typedef Kokkos::Experimental::OpenMPTarget execution_space ;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space ;
|
||||
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_scratch_size[2] ;
|
||||
int m_team_rank ;
|
||||
int m_team_size ;
|
||||
int m_league_rank ;
|
||||
int m_league_size ;
|
||||
int m_vector_length ;
|
||||
int m_vector_lane ;
|
||||
void* m_glb_scratch ;
|
||||
|
||||
/*
|
||||
// Fan-in team threads, root of the fan-in which does not block returns true
|
||||
inline
|
||||
bool team_fan_in() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
|
||||
}
|
||||
|
||||
if ( m_team_rank_rev ) {
|
||||
m_exec.state_set( Rendezvous );
|
||||
memory_fence();
|
||||
m_exec.state_wait( Rendezvous );
|
||||
}
|
||||
|
||||
return 0 == m_team_rank_rev ;
|
||||
}
|
||||
|
||||
inline
|
||||
void team_fan_out() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
|
||||
memory_fence();
|
||||
}
|
||||
}
|
||||
*/
|
||||
public:
|
||||
|
||||
/** \brief Team scratch handle in team mode: set_team_thread_mode(0,1,0). */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const
{
  return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
}

/** \brief Same as team_shmem(); the int argument is ignored. */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_scratch(int) const
{
  return m_team_shared.set_team_thread_mode( 0 , 1 , 0 );
}

/** \brief Scratch handle in per-thread mode:
 *         set_team_thread_mode(0, team_size(), team_rank()).
 *         The int argument is ignored. */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & thread_scratch(int) const
{
  return m_team_shared.set_team_thread_mode( 0 , team_size() , team_rank() );
}
|
||||
|
||||
/** \brief League rank stored in this member. */
KOKKOS_INLINE_FUNCTION
int league_rank() const
{ return m_league_rank ; }

/** \brief League size stored in this member. */
KOKKOS_INLINE_FUNCTION
int league_size() const
{ return m_league_size ; }

/** \brief Team rank stored in this member. */
KOKKOS_INLINE_FUNCTION
int team_rank() const
{ return m_team_rank ; }

/** \brief Team size stored in this member. */
KOKKOS_INLINE_FUNCTION
int team_size() const
{ return m_team_size ; }
|
||||
|
||||
/** \brief Team barrier, implemented as a single OpenMP barrier directive. */
KOKKOS_INLINE_FUNCTION void team_barrier() const
|
||||
{
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
template<class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast(ValueType& value, const int& thread_id) const
|
||||
{
|
||||
/*#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ }
|
||||
#else
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
|
||||
, ValueType , void >::type type ;
|
||||
|
||||
type * const local_value = ((type*) m_exec.scratch_thread());
|
||||
if(team_rank() == thread_id)
|
||||
*local_value = value;
|
||||
memory_fence();
|
||||
team_barrier();
|
||||
value = *local_value;
|
||||
#endif*/
|
||||
}
|
||||
|
||||
template< class ValueType, class JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION ValueType
|
||||
team_reduce( const ValueType & value
|
||||
, const JoinOp & op_in ) const {
|
||||
|
||||
#pragma omp barrier
|
||||
|
||||
typedef ValueType value_type;
|
||||
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
|
||||
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
|
||||
, value_type , void >::type type ;
|
||||
|
||||
const int n_values = TEAM_REDUCE_SIZE/sizeof(value_type);
|
||||
type * team_scratch = (type*) ((char*)m_glb_scratch + TEAM_REDUCE_SIZE*omp_get_team_num());
|
||||
for(int i = m_team_rank; i < n_values; i+= m_team_size) {
|
||||
team_scratch[i] = value_type();
|
||||
}
|
||||
|
||||
#pragma omp barrier
|
||||
|
||||
for(int k=0; k<m_team_size; k+=n_values) {
|
||||
if((k <= m_team_rank) && (k+n_values > m_team_rank))
|
||||
team_scratch[m_team_rank%n_values]+=value;
|
||||
#pragma omp barrier
|
||||
}
|
||||
|
||||
for(int d = 1; d<n_values;d*=2) {
|
||||
if((m_team_rank+d<n_values) && (m_team_rank%(2*d)==0)) {
|
||||
team_scratch[m_team_rank] += team_scratch[m_team_rank+d];
|
||||
}
|
||||
#pragma omp barrier
|
||||
}
|
||||
return team_scratch[0];
|
||||
}
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
* with intra-team non-deterministic ordering accumulation.
|
||||
*
|
||||
* The global inter-team accumulation value will, at the end of the
|
||||
* league's parallel execution, be the scan's total.
|
||||
* Parallel execution ordering of the league's teams is non-deterministic.
|
||||
* As such the base value for each team's scan operation is similarly
|
||||
* non-deterministic.
|
||||
*/
|
||||
template< typename ArgType >
|
||||
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
|
||||
{
|
||||
/* // Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
|
||||
|
||||
volatile type * const work_value = ((type*) m_exec.scratch_thread());
|
||||
|
||||
*work_value = value ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
if ( team_fan_in() ) {
|
||||
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
|
||||
// m_team_base[0] == highest ranking team member
|
||||
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
|
||||
//
|
||||
// 1) copy from lower to higher rank, initialize lowest rank to zero
|
||||
// 2) prefix sum from lowest to highest rank, skipping lowest rank
|
||||
|
||||
type accum = 0 ;
|
||||
|
||||
if ( global_accum ) {
|
||||
for ( int i = m_team_size ; i-- ; ) {
|
||||
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
|
||||
accum += val ;
|
||||
}
|
||||
accum = atomic_fetch_add( global_accum , accum );
|
||||
}
|
||||
|
||||
for ( int i = m_team_size ; i-- ; ) {
|
||||
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
|
||||
const type offset = accum ;
|
||||
accum += val ;
|
||||
val = offset ;
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
team_fan_out();
|
||||
|
||||
return *work_value ;*/
|
||||
return ArgType();
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
*
|
||||
* The highest rank thread can compute the reduction total as
|
||||
* reduction_total = dev.team_scan( value ) + value ;
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
|
||||
{ return this-> template team_scan<Type>( value , 0 ); }
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver
|
||||
|
||||
private:
|
||||
|
||||
typedef execution_space::scratch_memory_space space ;
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
OpenMPTargetExecTeamMember( const int league_rank, const int league_size, const int team_size, const int vector_length //const TeamPolicyInternal< OpenMPTarget, Properties ...> & team
|
||||
, void* const glb_scratch
|
||||
, const int shmem_size_L1
|
||||
, const int shmem_size_L2
|
||||
)
|
||||
: m_team_shared(0,0)
|
||||
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
|
||||
, m_team_rank(0)
|
||||
, m_vector_length( vector_length )
|
||||
, m_team_size( team_size )
|
||||
, m_league_rank( league_rank )
|
||||
, m_league_size( league_size )
|
||||
, m_glb_scratch( glb_scratch )
|
||||
{
|
||||
const int omp_tid = omp_get_thread_num();
|
||||
m_league_rank = league_rank;
|
||||
m_team_rank = omp_tid/m_vector_length;
|
||||
m_vector_lane = omp_tid%m_vector_length;
|
||||
}
|
||||
|
||||
/** \brief  Returns the TEAM_REDUCE_SIZE constant. */
static inline int team_reduce_size()
{
  return TEAM_REDUCE_SIZE ;
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... >: public PolicyTraits<Properties ...>
|
||||
{
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos execution policy
|
||||
typedef TeamPolicyInternal execution_policy ;
|
||||
|
||||
typedef PolicyTraits<Properties ... > traits;
|
||||
|
||||
TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
|
||||
m_league_size = p.m_league_size;
|
||||
m_team_size = p.m_team_size;
|
||||
m_vector_length = p.m_vector_length;
|
||||
m_team_alloc = p.m_team_alloc;
|
||||
m_team_iter = p.m_team_iter;
|
||||
m_team_scratch_size[0] = p.m_team_scratch_size[0];
|
||||
m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
|
||||
m_team_scratch_size[1] = p.m_team_scratch_size[1];
|
||||
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
|
||||
m_chunk_size = p.m_chunk_size;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & )
|
||||
{ return 1024; }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType & )
|
||||
{ return 256; }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType &, const int& vector_length)
|
||||
{ return 256/vector_length; }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
private:
|
||||
|
||||
int m_league_size ;
|
||||
int m_team_size ;
|
||||
int m_vector_length;
|
||||
int m_team_alloc ;
|
||||
int m_team_iter ;
|
||||
|
||||
size_t m_team_scratch_size[2];
|
||||
size_t m_thread_scratch_size[2];
|
||||
|
||||
int m_chunk_size;
|
||||
|
||||
inline void init( const int league_size_request
|
||||
, const int team_size_request
|
||||
, const int vector_length_request )
|
||||
{
|
||||
m_league_size = league_size_request ;
|
||||
|
||||
m_team_size = team_size_request;
|
||||
|
||||
m_vector_length = vector_length_request;
|
||||
|
||||
set_auto_chunk_size();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline int vector_length() const { return m_vector_length ; }
|
||||
inline int team_size() const { return m_team_size ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size(const int& level, int team_size_ = -1) const {
|
||||
if(team_size_ < 0)
|
||||
team_size_ = m_team_size;
|
||||
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
|
||||
}
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, int team_size_request
|
||||
, int vector_length_request = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request , vector_length_request); }
|
||||
|
||||
TeamPolicyInternal( typename traits::execution_space &
|
||||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int vector_length_request = 1)
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , 256/vector_length_request , vector_length_request ); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, int team_size_request
|
||||
, int vector_length_request = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , team_size_request , vector_length_request); }
|
||||
|
||||
TeamPolicyInternal( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int vector_length_request = 1 )
|
||||
: m_team_scratch_size { 0 , 0 }
|
||||
, m_thread_scratch_size { 0 , 0 }
|
||||
, m_chunk_size(0)
|
||||
{ init( league_size_request , 256/vector_length_request , vector_length_request ); }
|
||||
|
||||
inline int team_alloc() const { return m_team_alloc ; }
|
||||
inline int team_iter() const { return m_team_iter ; }
|
||||
|
||||
inline int chunk_size() const { return m_chunk_size ; }
|
||||
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_chunk_size = chunk_size_;
|
||||
return p;
|
||||
}
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
p.m_team_scratch_size[level] = per_team.value;
|
||||
p.m_thread_scratch_size[level] = per_thread.value;
|
||||
return p;
|
||||
};
|
||||
|
||||
private:
|
||||
/** \brief finalize chunk_size if it was set to AUTO*/
|
||||
inline void set_auto_chunk_size() {
|
||||
|
||||
int concurrency = traits::execution_space::thread_pool_size(0)/m_team_alloc;
|
||||
if( concurrency==0 ) concurrency=1;
|
||||
|
||||
if(m_chunk_size > 0) {
|
||||
if(!Impl::is_integral_power_of_two( m_chunk_size ))
|
||||
Kokkos::abort("TeamPolicy blocking granularity must be power of two" );
|
||||
}
|
||||
|
||||
int new_chunk_size = 1;
|
||||
while(new_chunk_size*100*concurrency < m_league_size)
|
||||
new_chunk_size *= 2;
|
||||
if(new_chunk_size < 128) {
|
||||
new_chunk_size = 1;
|
||||
while( (new_chunk_size*40*concurrency < m_league_size ) && (new_chunk_size<128) )
|
||||
new_chunk_size*=2;
|
||||
}
|
||||
m_chunk_size = new_chunk_size;
|
||||
}
|
||||
|
||||
public:
|
||||
typedef Impl::OpenMPTargetExecTeamMember member_type ;
|
||||
};
|
||||
} // namespace Impl
|
||||
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
inline
|
||||
int OpenMPTarget::thread_pool_size( int depth )
|
||||
{
|
||||
//return Impl::OpenMPTargetExec::pool_size(depth);
|
||||
return omp_get_max_threads();
|
||||
}
|
||||
|
||||
/** \brief  Rank of the calling thread as reported by omp_get_thread_num(). */
KOKKOS_INLINE_FUNCTION
int OpenMPTarget::thread_pool_rank()
{
  const int rank = omp_get_thread_num();
  return rank ;
}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
|
||||
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief  Wrap 'thread' in a ThreadSingleStruct for use with Kokkos::single. */
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> PerTeam(const Impl::OpenMPTargetExecTeamMember& thread) {
  typedef Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember> single_type ;
  return single_type( thread );
}
|
||||
|
||||
/** \brief  Wrap 'thread' in a VectorSingleStruct for use with Kokkos::single. */
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> PerThread(const Impl::OpenMPTargetExecTeamMember& thread) {
  typedef Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember> single_type ;
  return single_type( thread );
}
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, ValueType& result) {
|
||||
|
||||
result = ValueType();
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
|
||||
//result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
|
||||
//init_result = loop_boundaries.thread.team_reduce(result,join);
|
||||
}
|
||||
|
||||
} //namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
result = ValueType();
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
init_result = result;
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
|
||||
* for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
|
||||
* Depending on the target execution space the operator might be called twice: once with final=false
|
||||
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
|
||||
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
|
||||
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
|
||||
* to the final sum value over all vector lanes.
|
||||
* This functionality requires C++11 support.*/
|
||||
template< typename iType, class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >&
|
||||
loop_boundaries, const FunctorType & lambda) {
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
value_type scan_val = value_type();
|
||||
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,scan_val,true);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
|
||||
lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda) {
|
||||
if(single_struct.team_member.team_rank()==0) lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
lambda(val);
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::OpenMPTargetExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
if(single_struct.team_member.team_rank()==0) {
|
||||
lambda(val);
|
||||
}
|
||||
single_struct.team_member.team_broadcast(val,0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #ifndef KOKKOS_OPENMPTARGETEXEC_HPP */
|
||||
|
||||
@ -0,0 +1,767 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP
|
||||
#define KOKKOS_OPENMPTARGET_PARALLEL_HPP
|
||||
|
||||
#include <omp.h>
|
||||
#include <iostream>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <OpenMPTarget/Kokkos_OpenMPTarget_Exec.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const {
|
||||
execute_impl<WorkTag>();
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename Policy::member_type begin = m_policy.begin();
|
||||
const typename Policy::member_type end = m_policy.end();
|
||||
|
||||
#pragma omp target teams distribute parallel for map(to:this->m_functor)
|
||||
for(int i=begin; i<end; i++)
|
||||
m_functor(i);
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename Policy::member_type begin = m_policy.begin();
|
||||
const typename Policy::member_type end = m_policy.end();
|
||||
|
||||
#pragma omp target teams distribute parallel for num_threads(128) map(to:this->m_functor)
|
||||
for(int i=begin; i<end; i++)
|
||||
m_functor(TagType(),i);
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, Policy arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/** \brief  Fallback for reduce configurations without a dedicated specialization.
 *
 *  execute() performs no reduction; it only prints an error naming the
 *  FunctorHasJoin / UseReducerType combination that was requested.
 */
template< class FunctorType
        , class PolicyType
        , class ReducerType
        , class PointerType
        , class ValueType
        , int FunctorHasJoin
        , int UseReducerType >
struct ParallelReduceSpecialize {
  static inline void execute( const FunctorType & f
                            , const PolicyType & p
                            , PointerType result_ptr )
  {
    printf("Error: Invalid Specialization %i %i\n",FunctorHasJoin,UseReducerType);
  }
};
|
||||
|
||||
template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
|
||||
struct ParallelReduceSpecialize<FunctorType, Kokkos::RangePolicy<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
|
||||
typedef Kokkos::RangePolicy<PolicyArgs...> PolicyType;
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(+: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(+: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(TagType(),i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
inline static
|
||||
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
|
||||
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
|
||||
}
|
||||
};
|
||||
/*
|
||||
template<class FunctorType, class PolicyType, class ReducerType, class PointerType, class ValueType>
|
||||
struct ParallelReduceSpecialize<FunctorType, PolicyType, ReducerType, PointerType, ValueType, 0,1> {
|
||||
|
||||
#pragma omp declare reduction(custom: ValueType : ReducerType::join(omp_out, omp_in)) initializer ( ReducerType::init(omp_priv) )
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom:result) reduction(custom: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const typename PolicyType::member_type begin = p.begin();
|
||||
const typename PolicyType::member_type end = p.end();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(512) map(to:f) map(tofrom: result) reduction(custom: result)
|
||||
for(int i=begin; i<end; i++)
|
||||
f(TagType(),i,result);
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
inline static
|
||||
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
|
||||
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
template< class FunctorType , class ReducerType, class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ...>
|
||||
, ReducerType
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
// Static Assert WorkTag void if ReducerType not InvalidType
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
|
||||
|
||||
enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
|
||||
enum {UseReducer = is_reducer_type<ReducerType>::value };
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
public:
|
||||
inline void execute() const {
|
||||
ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ViewType & arg_result_view
|
||||
, typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelScan< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
/*
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update , final );
|
||||
}
|
||||
}
|
||||
*/
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
/* OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_scan");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_scan");
|
||||
|
||||
OpenMPTargetExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr =
|
||||
pointer_type( exec.scratch_reduce() ) +
|
||||
ValueTraits::value_count( m_functor );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( m_functor , ptr ) , false );
|
||||
}
|
||||
|
||||
{
|
||||
const unsigned thread_count = OpenMPTargetExec::pool_size();
|
||||
const unsigned value_count = ValueTraits::value_count( m_functor );
|
||||
|
||||
pointer_type ptr_prev = 0 ;
|
||||
|
||||
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
|
||||
|
||||
pointer_type ptr = pointer_type( OpenMPTargetExec::pool_rev(rank_rev)->scratch_reduce() );
|
||||
|
||||
if ( ptr_prev ) {
|
||||
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
|
||||
ValueJoin::join( m_functor , ptr + value_count , ptr );
|
||||
}
|
||||
else {
|
||||
ValueInit::init( m_functor , ptr );
|
||||
}
|
||||
|
||||
ptr_prev = ptr ;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPTargetExec & exec = * OpenMPTargetExec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueOps::reference( ptr ) , true );
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
|
||||
//----------------------------------------
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class ... Properties >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const {
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
execute_impl<WorkTag>();
|
||||
}
|
||||
|
||||
private:
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const int league_size = m_policy.league_size();
|
||||
const int team_size = m_policy.team_size();
|
||||
const int vector_length = m_policy.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
|
||||
#pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
|
||||
map(to:this->m_functor,scratch_ptr)
|
||||
for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
|
||||
typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
m_functor(team);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl() const
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
const int league_size = m_policy.league_size();
|
||||
const int team_size = m_policy.team_size();
|
||||
const int vector_length = m_policy.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,Policy::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
#pragma omp target teams distribute parallel for num_teams(league_size) num_threads(team_size*vector_length) schedule(static,1) \
|
||||
map(to:this->m_functor,scratch_ptr)
|
||||
for(int i=0 ; i<league_size ; i++) {
|
||||
typename Policy::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
m_functor(TagType(), team);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
};
|
||||
|
||||
template<class FunctorType, class ReducerType, class PointerType, class ValueType, class ... PolicyArgs>
|
||||
struct ParallelReduceSpecialize<FunctorType, TeamPolicyInternal<PolicyArgs...>, ReducerType, PointerType, ValueType, 0,0> {
|
||||
typedef TeamPolicyInternal<PolicyArgs...> PolicyType;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
|
||||
const int league_size = p.league_size();
|
||||
const int team_size = p.team_size();
|
||||
const int vector_length = p.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
|
||||
map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
|
||||
for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
|
||||
typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
f(team,result);
|
||||
if(team.m_vector_lane!=0) result = 0;
|
||||
}
|
||||
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
execute_impl(const FunctorType& f, const PolicyType& p, PointerType result_ptr)
|
||||
{
|
||||
OpenMPTargetExec::verify_is_process("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
OpenMPTargetExec::verify_initialized("Kokkos::Experimental::OpenMPTarget parallel_for");
|
||||
|
||||
const int league_size = p.league_size();
|
||||
const int team_size = p.team_size();
|
||||
const int vector_length = p.vector_length();
|
||||
const int nteams = OpenMPTargetExec::MAX_ACTIVE_TEAMS<league_size?OpenMPTargetExec::MAX_ACTIVE_TEAMS:league_size;
|
||||
|
||||
OpenMPTargetExec::resize_scratch(0,PolicyType::member_type::TEAM_REDUCE_SIZE,0,0);
|
||||
void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr();
|
||||
|
||||
ValueType result = ValueType();
|
||||
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(team_size*vector_length) \
|
||||
map(to:f,scratch_ptr) map(tofrom:result) reduction(+: result) schedule(static,1)
|
||||
for(int i=0 ; i<league_size*team_size*vector_length ; i++) {
|
||||
typename PolicyType::member_type team(i/(team_size*vector_length),league_size,team_size,vector_length, scratch_ptr, 0,0);
|
||||
f(TagType(),team,result);
|
||||
if(team.vector_lane!=0) result = 0;
|
||||
}
|
||||
*result_ptr=result;
|
||||
}
|
||||
|
||||
|
||||
inline static
|
||||
void execute(const FunctorType& f, const PolicyType& p, PointerType ptr) {
|
||||
execute_impl<typename PolicyType::work_tag>(f,p,ptr);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template< class FunctorType , class ReducerType, class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Properties ... >
|
||||
, ReducerType
|
||||
, Kokkos::Experimental::OpenMPTarget
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::OpenMPTarget, Properties ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
enum {HasJoin = ReduceFunctorHasJoin<FunctorType>::value };
|
||||
enum {UseReducer = is_reducer_type<ReducerType>::value };
|
||||
|
||||
typedef ParallelReduceSpecialize<FunctorType,Policy,ReducerType,pointer_type,typename ValueTraits::value_type,HasJoin,UseReducer> ParForSpecialize;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const ReducerType m_reducer ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const {
|
||||
ParForSpecialize::execute(m_functor,m_policy,m_result_ptr);
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const ViewType & arg_result ,
|
||||
typename std::enable_if<
|
||||
Kokkos::is_view< ViewType >::value &&
|
||||
!Kokkos::is_reducer_type<ReducerType>::value
|
||||
,void*>::type = NULL)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, Policy arg_policy
|
||||
, const ReducerType& reducer )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Experimental::OpenMPTarget must be a Kokkos::View in HostSpace" );*/
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
|
||||
typedef iType index_type;
|
||||
const iType start;
|
||||
const iType end;
|
||||
const iType increment;
|
||||
|
||||
inline
|
||||
TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
|
||||
start( thread_.team_rank() ),
|
||||
end( count ),
|
||||
increment( thread_.team_size() )
|
||||
{}
|
||||
inline
|
||||
TeamThreadRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
|
||||
start( begin_+thread_.team_rank() ),
|
||||
end( end_ ),
|
||||
increment( thread_.team_size() )
|
||||
{}
|
||||
};
|
||||
|
||||
template<typename iType>
|
||||
struct ThreadVectorRangeBoundariesStruct<iType,OpenMPTargetExecTeamMember> {
|
||||
typedef iType index_type;
|
||||
const iType start;
|
||||
const iType end;
|
||||
const iType increment;
|
||||
|
||||
inline
|
||||
ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& count):
|
||||
start( thread_.m_vector_lane ),
|
||||
end( count ),
|
||||
increment( thread_.m_vector_length )
|
||||
{}
|
||||
inline
|
||||
ThreadVectorRangeBoundariesStruct (const OpenMPTargetExecTeamMember& thread_, const iType& begin_, const iType& end_):
|
||||
start( begin_+thread_.m_vector_lane ),
|
||||
end( end_ ),
|
||||
increment( thread_.m_vector_length )
|
||||
{}
|
||||
};
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >
|
||||
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>
|
||||
ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, const iType& begin, const iType& end) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPTargetExecTeamMember>(thread,begin,end);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */
|
||||
|
||||
329
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
Normal file
329
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
Normal file
@ -0,0 +1,329 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Default construction: no thread-pool data attached, all synchronization
// state zeroed, and a team consisting of a single member with rank zero.
TaskExec< Kokkos::Experimental::OpenMPTarget >::TaskExec()
  : m_self_exec( nullptr )
  , m_team_exec( nullptr )
  , m_sync_mask( 0 )
  , m_sync_value( 0 )
  , m_sync_step( 0 )
  , m_group_rank( 0 )
  , m_team_rank( 0 )
  , m_team_size( 1 )
{}
|
||||
|
||||
// Team construction: derives this thread's group and team rank from its
// reversed pool rank, clears the two barrier words in this thread's
// reduce scratch, and builds the per-member barrier bit patterns.
TaskExec< Kokkos::Experimental::OpenMPTarget >::
TaskExec( Kokkos::Impl::OpenMPTargetExec & arg_exec , int const arg_team_size )
  : m_self_exec( & arg_exec )
  , m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
  , m_sync_mask( 0 )
  , m_sync_value( 0 )
  , m_sync_step( 0 )
  , m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
  , m_team_rank(  arg_exec.pool_rank_rev() % arg_team_size )
  , m_team_size(  arg_team_size )
{
  // This team spans
  //    m_self_exec->pool_rev( team_size * group_rank )
  //    m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )

  int64_t volatile * const barrier_words =
    (int64_t *) m_self_exec->scratch_reduce();

  barrier_words[0] = int64_t(0) ;
  barrier_words[1] = int64_t(0) ;

  // One byte per team member: value 1 in m_sync_value, mask 3 in m_sync_mask.
  for ( int member = 0 ; member < m_team_size ; ++member ) {
    const int shift = 8 * member ;
    m_sync_value |= int64_t(1) << shift ;
    m_sync_mask  |= int64_t(3) << shift ;
  }

  Kokkos::memory_fence();
}
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
// Spin barrier across the members of one team.
// Each member writes its own byte of a shared 64-bit word and then spins
// until the whole word equals the expected pattern m_sync_value.
void TaskExec< Kokkos::Experimental::OpenMPTarget >::team_barrier_impl() const
{
  // Two int64_t words are needed because barriers alternate between them.
  if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
    Kokkos::abort("TaskQueue<OpenMPTarget> scratch_reduce memory too small");
  }

  // Use team shared memory to synchronize.
  // Alternate memory locations between barriers to avoid a sequence
  // of barriers overtaking one another.

  int64_t volatile * const sync_word =
    ((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );

  // This team member sets one byte within the sync variable
  int8_t volatile * const arrival_byte =
    ((int8_t *) sync_word) + m_team_rank ;

#if 0
fprintf( stdout
       , "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
       , m_group_rank
       , m_team_rank
       , m_sync_step
       , m_sync_value
       , *sync_word
       );
fflush(stdout);
#endif

  *arrival_byte = int8_t( m_sync_value & 0x03 ); // signal arrival

  while ( m_sync_value != *sync_word ); // wait for team to arrive

#if 0
fprintf( stdout
       , "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
       , m_group_rank
       , m_team_rank
       , m_sync_step
       , m_sync_value
       , *sync_word
       );
fflush(stdout);
#endif

  ++m_sync_step ;

  // Every other step: flip the expected pattern and keep the step
  // counter from growing without bound.
  if ( ( m_sync_step & 0x01 ) == 0 ) {
    m_sync_value ^= m_sync_mask ;
    if ( m_sync_step > 1000 ) { m_sync_step = 0 ; }
  }
}
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Drain the task queue using the whole thread pool.
// Threads are grouped into teams of PoolExec::pool_size(2) threads; the
// rank-0 member of each team pops a task, shares it with its team through
// team scratch memory, and the team then runs it (team tasks) or waits
// while rank 0 runs it alone (single-thread tasks).
void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::execute
  ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
{
  using execution_space = Kokkos::Experimental::OpenMPTarget ;
  using queue_type      = TaskQueue< execution_space > ;
  using task_root_type  = TaskBase< execution_space , void , void > ;
  using PoolExec        = Kokkos::Impl::OpenMPTargetExec ;
  using Member          = TaskExec< execution_space > ;

  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

  // Required: team_size <= 8

  const int team_size = PoolExec::pool_size(2); // Threads per core
  // const int team_size = PoolExec::pool_size(1); // Threads per NUMA

  if ( team_size > 8 ) {
    Kokkos::abort("TaskQueue<OpenMPTarget> unsupported team size");
  }

#pragma omp parallel
  {
    PoolExec & pool_self = *PoolExec::get_thread_omp();

    Member single_exec ;
    Member team_exec( pool_self , team_size );

    // Team shared memory
    task_root_type * volatile * const task_shared =
      (task_root_type **) team_exec.m_team_exec->scratch_thread();

// Barrier across entire OpenMPTarget thread pool to insure initialization
#pragma omp barrier

    // Loop until all queues are empty and no tasks in flight

    for (;;) {

      task_root_type * task = 0 ;

      // Each team lead attempts to acquire either a thread team task
      // or a single thread task for the team.

      if ( 0 == team_exec.team_rank() ) {

        // 'end' means "tasks remain but none popped yet"; 0 means "finished".
        task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;

        // Loop by priority and then type
        for ( int priority = 0 ; priority < queue_type::NumQueue && end == task ; ++priority ) {
          for ( int type = 0 ; type < 2 && end == task ; ++type ) {
            task = queue_type::pop_task( & queue->m_ready[priority][type] );
          }
        }
      }

      // Team lead broadcast acquired task to team members:

      if ( 1 < team_exec.team_size() ) {

        if ( 0 == team_exec.team_rank() ) { *task_shared = task ; }

        // Fence to be sure task_shared is stored before the barrier
        Kokkos::memory_fence();

        // Whole team waits for every team member to reach this statement
        team_exec.team_barrier();

        // Fence to be sure task_shared is stored
        Kokkos::memory_fence();

        task = *task_shared ;
      }

#if 0
fprintf( stdout
       , "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
       , team_exec.m_group_rank
       , team_exec.m_team_rank
       , uintptr_t(task_shared)
       , uintptr_t(task)
       );
fflush(stdout);
#endif

      if ( 0 == task ) { break ; } // 0 == m_ready_count

      if ( end == task ) {
        // All team members wait for whole team to reach this statement.
        // Is necessary to prevent task_shared from being updated
        // before it is read by all threads.
        team_exec.team_barrier();
      }
      else if ( task_root_type::TaskTeam == task->m_task_type ) {
        // Thread Team Task
        (*task->m_apply)( task , & team_exec );

        // The m_apply function performs a barrier

        if ( 0 == team_exec.team_rank() ) {
          // team member #0 completes the task, which may delete the task
          queue->complete( task );
        }
      }
      else {
        // Single Thread Task

        if ( 0 == team_exec.team_rank() ) {

          (*task->m_apply)( task , & single_exec );

          queue->complete( task );
        }

        // All team members wait for whole team to reach this statement.
        // Not necessary to complete the task.
        // Is necessary to prevent task_shared from being updated
        // before it is read by all threads.
        team_exec.team_barrier();
      }
    }
  }
// END #pragma omp parallel

}
|
||||
|
||||
// When called with exactly one OpenMP thread active, drain the ready queues
// on the calling thread: pop tasks by priority and type, apply each one as a
// single-thread task and mark it complete.  Otherwise do nothing.
void TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >::
  iff_single_thread_recursive_execute
    ( TaskQueue< Kokkos::Experimental::OpenMPTarget > * const queue )
{
  using execution_space = Kokkos::Experimental::OpenMPTarget ;
  using queue_type      = TaskQueue< execution_space > ;
  using task_root_type  = TaskBase< execution_space , void , void > ;
  using Member          = TaskExec< execution_space > ;

  if ( 1 != omp_get_num_threads() ) { return ; }

  task_root_type * const end = (task_root_type *) task_root_type::EndTag ;

  Member single_exec ;

  for (;;) {

    // 'end' doubles as the "nothing popped yet" marker.
    task_root_type * task = end ;

    // Loop by priority and then type
    for ( int priority = 0 ; priority < queue_type::NumQueue && end == task ; ++priority ) {
      for ( int type = 0 ; type < 2 && end == task ; ++type ) {
        task = queue_type::pop_task( & queue->m_ready[priority][type] );
      }
    }

    if ( end == task ) { break ; }

    (*task->m_apply)( task , & single_exec );

    queue->complete( task );
  }
}
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
||||
356
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
Normal file
356
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp
Normal file
@ -0,0 +1,356 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
#define KOKKOS_IMPL_OPENMP_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
// Explicit specialization of TaskQueueSpecialization for the
// Kokkos::Experimental::OpenMPTarget execution space.  It supplies the type
// aliases and static entry points declared below; only proc_set_apply is
// defined inline, the other functions are declared here and defined elsewhere.
template<>
|
||||
class TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget >
|
||||
{
|
||||
public:
|
||||
|
||||
||||
using execution_space = Kokkos::Experimental::OpenMPTarget ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
|
||||
||||
// Must specify memory space
|
||||
using memory_space = Kokkos::HostSpace ;
|
||||
|
||||
||||
// Declared only; NOTE(review): the name suggests it executes the queue
// recursively only when a single thread is in use -- confirm against the
// definition.
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const );
|
||||
|
||||
||||
// Must provide task queue execution function
|
||||
static void execute( queue_type * const );
|
||||
|
||||
||||
// Must provide mechanism to set function pointer in
|
||||
// execution space from the host process.
// Writes the address of TaskBase< OpenMPTarget , value_type , FunctorType >::apply
// through 'ptr'; 'ptr' is dereferenced unconditionally, so it must be non-null.
|
||||
template< typename FunctorType >
|
||||
static
|
||||
void proc_set_apply( task_base_type::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< Kokkos::Experimental::OpenMPTarget
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> ;
|
||||
*ptr = TaskType::apply ;
|
||||
}
|
||||
};
|
||||
|
||||
extern template class TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
// Explicit specialization of TaskExec for Kokkos::Experimental::OpenMPTarget:
// the team-member handle seen by task functors.  It exposes team rank/size,
// a team barrier and a team-shared scratch pointer.  Construction is private
// and restricted to the befriended TaskQueue / TaskQueueSpecialization types;
// copy and move are deleted.
template<>
|
||||
class TaskExec< Kokkos::Experimental::OpenMPTarget >
|
||||
{
|
||||
private:
|
||||
|
||||
||||
TaskExec( TaskExec && ) = delete ;
|
||||
TaskExec( TaskExec const & ) = delete ;
|
||||
TaskExec & operator = ( TaskExec && ) = delete ;
|
||||
TaskExec & operator = ( TaskExec const & ) = delete ;
|
||||
|
||||
||||
|
||||
||||
using PoolExec = Kokkos::Impl::OpenMPTargetExec ;
|
||||
|
||||
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::Experimental::OpenMPTarget > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Experimental::OpenMPTarget > ;
|
||||
|
||||
||||
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
|
||||
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
|
||||
// NOTE(review): the three m_sync_* members are presumably the state used by
// team_barrier_impl(); their exact meaning is not visible in this header.
int64_t m_sync_mask ;
|
||||
int64_t mutable m_sync_value ;
|
||||
int mutable m_sync_step ;
|
||||
int m_group_rank ; ///< Which "team" subset of thread pool
|
||||
int m_team_rank ; ///< Which thread within a team
|
||||
int m_team_size ;
|
||||
|
||||
||||
TaskExec();
|
||||
TaskExec( PoolExec & arg_exec , int arg_team_size );
|
||||
|
||||
||||
// Out-of-line barrier; only reached from team_barrier() when the team has
// more than one member.
void team_barrier_impl() const ;
|
||||
|
||||
||||
public:
|
||||
|
||||
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
// Host path: scratch pointer of the team's PoolExec, or null when
// m_team_exec is null.
void * team_shared() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
|
||||
|
||||
||||
// Host path: size reported by the team's PoolExec, or 0 when m_team_exec
// is null.
int team_shared_size() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
|
||||
|
||||
||||
/**\brief Whole team enters this function call
|
||||
* before any team member returns from
|
||||
* this function call.
|
||||
*/
|
||||
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
|
||||
#else
|
||||
// Non-host path: the barrier is a no-op and no shared scratch is available.
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
|
||||
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
|
||||
#endif
|
||||
|
||||
||||
// Rank of the calling member within its team.
KOKKOS_INLINE_FUNCTION
|
||||
int team_rank() const { return m_team_rank ; }
|
||||
|
||||
||||
// Number of members in the calling member's team.
KOKKOS_INLINE_FUNCTION
|
||||
int team_size() const { return m_team_size ; }
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
// Builds the team-thread range descriptor for an OpenMPTarget task team member
// from a single 'count' argument.  The mapping of 'count' onto per-member
// start/end/increment is done by the TeamThreadRangeBoundariesStruct
// constructor, which is not visible here.
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >
|
||||
TeamThreadRange
|
||||
( Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >(thread,count);
|
||||
}
|
||||
|
||||
// Builds the team-thread range descriptor for an OpenMPTarget task team member
// from explicit 'start' and 'end' arguments, forwarding them unchanged to the
// TeamThreadRangeBoundariesStruct constructor.
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >
|
||||
TeamThreadRange
|
||||
( Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > & thread
|
||||
, const iType & start
|
||||
, const iType & end )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >(thread,start,end);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the calling thread team.
|
||||
* This functionality requires C++11 support.
|
||||
*/
|
||||
// Each caller walks only its own slice: from loop_boundaries.start up to (but
// excluding) loop_boundaries.end, stepping by loop_boundaries.increment.  No
// barrier is issued by this function.
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
)
|
||||
{
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i);
|
||||
}
|
||||
}
|
||||
|
||||
// Team-level reduction over a TeamThreadRange, combining partial results with
// operator+=.
//
// Each caller runs lambda(i, result) over its own slice of the range, starting
// from a copy of 'initialized_result'.  When the team has more than one member,
// every member stores its partial result at shared[team_rank] in the buffer
// returned by team_shared(), member 0 folds slots 1..team_size-1 into slot 0,
// and after a final barrier every member copies slot 0 into
// 'initialized_result'.  With a single member the local result is written back
// directly.
//
// NOTE(review): every member seeds its partial from 'initialized_result', so a
// non-zero seed is added once per team member -- confirm this is intended.
// NOTE(review): assumes the team_shared() buffer is non-null and can hold
// team_size values of ValueType; neither is checked here.
template<typename iType, class Lambda, typename ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
, ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
|
||||
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
||||
// Barrier before writing so no member overwrites the shared buffer while
// another member may still be using it.
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
shared[0] += shared[i];
|
||||
}
|
||||
}
|
||||
|
||||
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
// Team-level reduction over a TeamThreadRange, combining partial results with
// a caller-supplied 'join' callable instead of operator+=.
//
// Each caller runs lambda(i, result) over its own slice of the range, starting
// from a copy of 'initialized_result'.  When the team has more than one member,
// partial results are staged at shared[team_rank] in the team_shared() buffer,
// member 0 calls join(shared[0], shared[i]) for i = 1..team_size-1, and after a
// final barrier every member copies slot 0 into 'initialized_result'.  With a
// single member the local result is written back directly.
//
// NOTE(review): every member seeds its partial from 'initialized_result', so
// the seed is joined in once per team member -- confirm this is intended.
// NOTE(review): assumes the team_shared() buffer is non-null and can hold
// team_size values of ValueType; neither is checked here.
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
||||
// Barrier before writing so no member overwrites the shared buffer while
// another member may still be using it.
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
join(shared[0], shared[i]);
|
||||
}
|
||||
}
|
||||
|
||||
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
// Vector-range parallel_reduce for OpenMPTarget task teams.  The body is
// intentionally empty: 'lambda' is never invoked and 'initialized_result' is
// left unmodified.
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
// placeholder for future function
// Vector-range parallel_reduce with a custom join for OpenMPTarget task teams.
// The body is intentionally empty: neither 'lambda' nor 'join' is invoked and
// 'initialized_result' is left unmodified.
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
// Team-level scan over a TeamThreadRange for OpenMPTarget task teams.
//
// ValueType does not appear in the function parameters, so callers must supply
// it explicitly; it must be constructible/assignable from 0 and support +=.
//
// The work is done in two passes over the caller's own slice of the range.  In
// each pass, for every index i the lambda is called twice: once with
// final=false on a zeroed 'local_total' to obtain that index's contribution,
// and once with final=true on a copy of the running prefix 'accum'.
//   Pass 1 starts every member's prefix at 0 and yields the member's total.
//   The totals are exchanged through the team_shared() buffer and member 0
//   turns them into a running sum in place.
//   Pass 2 re-runs the loop with the prefix seeded from the preceding member's
//   running sum (0 for member 0).
//
// NOTE(review): the lambda is invoked with final=true in pass 1 as well as in
// pass 2, so any side effect of the final call happens twice per index, the
// first time with a prefix that ignores lower-ranked members -- confirm this
// is intended.
// NOTE(review): shared[] is written without a preceding barrier and without a
// team_size check; assumes team_shared() is non-null and large enough.
template< typename ValueType, typename iType, class Lambda >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, local_total;
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
int team_size = loop_boundaries.thread.team_size();
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
|
||||
||||
// Intra-member scan
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
|
||||
||||
shared[team_rank] = accum;
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
||||
// Member 0 do scan on accumulated totals
|
||||
if (team_rank == 0) {
|
||||
for( iType i = 1; i < team_size; i+=1) {
|
||||
shared[i] += shared[i-1];
|
||||
}
|
||||
accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
|
||||
}
|
||||
|
||||
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
||||
// Inter-member scan adding in accumulated totals
|
||||
if (team_rank != 0) { accum = shared[team_rank-1]; }
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
// Vector-range parallel_scan for OpenMPTarget task teams.  The body is
// intentionally empty: 'lambda' is never invoked.  ValueType does not appear
// in the parameters, so it cannot be deduced and must be given explicitly.
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Experimental::OpenMPTarget > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
|
||||
|
||||
@ -41,12 +41,13 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
@ -215,6 +216,8 @@ void Qthreads::fence()
|
||||
int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
|
||||
int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
|
||||
|
||||
const char* Qthreads::name() { return "Qthreads"; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -516,4 +519,7 @@ QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMemb
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_SRC_QTHREADS_EXEC_PREVENT_LINK_ERROR() {}
|
||||
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
|
||||
@ -44,6 +44,9 @@
|
||||
#ifndef KOKKOS_QTHREADSEXEC_HPP
|
||||
#define KOKKOS_QTHREADSEXEC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -637,4 +640,6 @@ public:
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif // #define KOKKOS_QTHREADSEXEC_HPP
|
||||
|
||||
|
||||
@ -44,6 +44,9 @@
|
||||
#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
|
||||
#define KOKKOS_QTHREADS_PARALLEL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
@ -724,4 +727,6 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& sing
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */
|
||||
|
||||
|
||||
@ -41,10 +41,10 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -314,7 +314,7 @@ void TaskQueueSpecialization< Kokkos::Qthreads >::
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#else
|
||||
void KOKKOS_SRC_QTHREADS_TASK_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
||||
|
||||
@ -44,7 +44,8 @@
|
||||
#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
|
||||
#define KOKKOS_IMPL_QTHREADS_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,20 +36,22 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
// Experimental unified task-data parallel manycore LDRD.
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdexcept>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
@ -448,7 +450,7 @@ TaskPolicy< Kokkos::Qthreads >::
|
||||
TaskPolicy
|
||||
( const unsigned /* arg_task_max_count */
|
||||
, const unsigned /* arg_task_max_size */
|
||||
, const unsigned arg_task_default_dependence_capacity
|
||||
, const unsigned arg_task_default_dependence_capacity
|
||||
, const unsigned arg_task_team_size
|
||||
)
|
||||
: m_default_dependence_capacity( arg_task_default_dependence_capacity )
|
||||
@ -484,5 +486,8 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_QTHREADS_KOKKOS_QTHREADS_TASKPOLICY_PREVENT_LINK_ERROR() {}
|
||||
#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -46,6 +46,9 @@
|
||||
#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
|
||||
#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <stdexcept>
|
||||
@ -69,7 +72,6 @@
|
||||
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -505,7 +507,7 @@ public:
|
||||
|
||||
template< class ValueType >
|
||||
const Future< ValueType , execution_space > &
|
||||
spawn( const Future< ValueType , execution_space > & f
|
||||
spawn( const Future< ValueType , execution_space > & f
|
||||
, const bool priority = false ) const
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
@ -632,7 +634,7 @@ public:
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
void respawn( FunctorType * task_functor
|
||||
void respawn( FunctorType * task_functor
|
||||
, const bool priority = false ) const
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
@ -41,7 +41,11 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#ifndef KOKKOS_QTHREADS_TASKQUEUE_HPP
|
||||
#define KOKKOS_QTHREADS_TASKQUEUE_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -60,7 +64,7 @@ private:
|
||||
using execution_space = Kokkos::Qthread ;
|
||||
using memory_space = Kokkos::HostSpace
|
||||
using device_type = Kokkos::Device< execution_space, memory_space > ;
|
||||
using memory_pool = Kokkos::Experimental::MemoryPool< device_type > ;
|
||||
using memory_pool = Kokkos::MemoryPool< device_type > ;
|
||||
using task_root_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
|
||||
|
||||
friend class Kokkos::TaskScheduler< execution_space > ;
|
||||
@ -317,3 +321,5 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif // KOKKOS_QTHREADS_TASKQUEUE_HPP
|
||||
|
||||
|
||||
@ -41,7 +41,11 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
#ifndef KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
|
||||
#define KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
@ -433,4 +437,5 @@ fflush(stdout);
|
||||
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif // KOKKOS_QTHREADS_TASKQUEUE_IMPL_HPP
|
||||
|
||||
|
||||
@ -41,11 +41,13 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
|
||||
|
||||
#include <stdint.h>
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
@ -512,8 +514,8 @@ void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
|
||||
|
||||
s << "Kokkos::Threads" ;
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
s << " KOKKOS_ENABLE_PTHREAD" ;
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
s << " KOKKOS_ENABLE_THREADS" ;
|
||||
#endif
|
||||
#if defined( KOKKOS_ENABLE_HWLOC )
|
||||
s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
|
||||
@ -817,10 +819,12 @@ int Threads::thread_pool_rank()
|
||||
}
|
||||
#endif
|
||||
|
||||
const char* Threads::name() { return "Threads"; }
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD ) */
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_THREADS_EXEC_PREVENT_LINK_ERROR() {}
|
||||
#endif /* #if defined( KOKKOS_ENABLE_THREADS ) */
|
||||
|
||||
|
||||
@ -44,7 +44,10 @@
|
||||
#ifndef KOKKOS_THREADSEXEC_HPP
|
||||
#define KOKKOS_THREADSEXEC_HPP
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
@ -626,6 +629,6 @@ inline void Threads::fence()
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_THREADSEXEC_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,18 +36,16 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
/* Standard 'C' Linux libraries */
|
||||
|
||||
#include <pthread.h>
|
||||
@ -154,6 +152,8 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value )
|
||||
|
||||
#elif defined( KOKKOS_ENABLE_WINTHREAD )
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
/* Windows libraries */
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
@ -247,9 +247,7 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value ) {}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#else
|
||||
void KOKKOS_CORE_SRC_THREADS_EXEC_BASE_PREVENT_LINK_ERROR() {}
|
||||
#endif /* end #elif defined( KOKKOS_ENABLE_WINTHREAD ) */
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
|
||||
@ -44,7 +44,10 @@
|
||||
#ifndef KOKKOS_THREADSTEAM_HPP
|
||||
#define KOKKOS_THREADSTEAM_HPP
|
||||
|
||||
#include <stdio.h>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
@ -173,7 +176,9 @@ public:
|
||||
}
|
||||
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< !Kokkos::is_reducer< Type >::value , Type>::type
|
||||
team_reduce( const Type & value ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return Type(); }
|
||||
#else
|
||||
@ -202,6 +207,55 @@ public:
|
||||
}
|
||||
#endif
|
||||
|
||||
// team_reduce overload selected when ReducerType satisfies Kokkos::is_reducer.
// Reduces reducer.reference() across the team using reducer.join() and writes
// the team-wide value back into reducer.reference() on every member.
//
// When KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST is not defined the function
// is an empty stub.  On the host path it returns immediately, leaving
// reducer.reference() untouched, when m_exec is null.
template< typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
team_reduce( const ReducerType & ) const
|
||||
{}
|
||||
#else
|
||||
team_reduce( const ReducerType & reducer ) const
|
||||
{
|
||||
typedef typename ReducerType::value_type value_type;
|
||||
// Make sure there is enough scratch space:
// 'type' is value_type when sizeof(value_type) < TEAM_REDUCE_SIZE and void
// otherwise; presumably the void case is meant to make the dereferences below
// fail to compile -- TODO confirm.
|
||||
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
|
||||
, value_type , void >::type type ;
|
||||
|
||||
||||
if ( 0 == m_exec ) return ;
|
||||
|
||||
||||
type * const local_value = ((type*) m_exec->scratch_memory());
|
||||
|
||||
||||
// Set this thread's contribution
|
||||
*local_value = reducer.reference() ;
|
||||
|
||||
||||
// Fence to make sure the base team member has access:
|
||||
memory_fence();
|
||||
|
||||
||||
if ( team_fan_in() ) {
|
||||
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
|
||||
type * const team_value = ((type*) m_team_base[0]->scratch_memory());
|
||||
|
||||
||||
// Join to the team value:
|
||||
for ( int i = 1 ; i < m_team_size ; ++i ) {
|
||||
reducer.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
|
||||
}
|
||||
|
||||
||||
// Team base thread may "lap" member threads so copy out to their local value.
|
||||
for ( int i = 1 ; i < m_team_size ; ++i ) {
|
||||
*((type*) m_team_base[i]->scratch_memory()) = *team_value ;
|
||||
}
|
||||
|
||||
||||
// Fence to make sure all team members have access
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
||||
team_fan_out();
|
||||
|
||||
||||
// Value was changed by the team base
// Read through a volatile-qualified pointer so the value written by the team
// base is re-read from memory rather than reused from the earlier store.
|
||||
reducer.reference() = *((type volatile const *) local_value);
|
||||
}
|
||||
#endif
|
||||
|
||||
template< class ValueType, class JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION ValueType
|
||||
team_reduce( const ValueType & value
|
||||
@ -760,7 +814,8 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thread
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
|
||||
typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
|
||||
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, ValueType& result) {
|
||||
|
||||
result = ValueType();
|
||||
@ -774,6 +829,21 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Thr
|
||||
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
|
||||
}
|
||||
|
||||
// TeamThreadRange parallel_reduce overload selected when ReducerType satisfies
// Kokkos::is_reducer.  The reducer's value is reset with reducer.init(), each
// caller accumulates lambda(i, reducer.reference()) over its own slice of the
// range, and the per-member values are then combined across the team by
// team_reduce(reducer).
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, const ReducerType& reducer) {
|
||||
|
||||
||||
reducer.init(reducer.reference());
|
||||
|
||||
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,reducer.reference());
|
||||
}
|
||||
|
||||
||||
loop_boundaries.thread.team_reduce(reducer);
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
|
||||
@ -822,17 +892,26 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Thre
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
|
||||
typename std::enable_if< !Kokkos::is_reducer< ValueType >::value >::type
|
||||
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
result = ValueType();
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
}
|
||||
|
||||
// ThreadVectorRange parallel_reduce overload selected when ReducerType
// satisfies Kokkos::is_reducer.  The reducer's value is reset with
// reducer.init() and lambda(i, reducer.reference()) is then applied over the
// caller's range in a plain loop; no cross-thread combination and no
// reducer.join() call takes place in this function.
template< typename iType, class Lambda, typename ReducerType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< Kokkos::is_reducer< ReducerType >::value >::type
|
||||
parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const ReducerType& reducer) {
|
||||
reducer.init(reducer.reference());
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,reducer.reference());
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
|
||||
@ -915,6 +994,6 @@ void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_THREADSTEAM_HPP */
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -44,8 +44,11 @@
|
||||
#ifndef KOKKOS_THREADS_PARALLEL_HPP
|
||||
#define KOKKOS_THREADS_PARALLEL_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ENABLE_THREADS )
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include <iostream>
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
|
||||
@ -83,7 +86,7 @@ private:
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -99,7 +102,7 @@ private:
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -308,7 +311,7 @@ private:
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update )
|
||||
{
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -325,7 +328,7 @@ private:
|
||||
, reference_type update )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -348,7 +351,7 @@ private:
|
||||
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( self.m_functor , range.begin() , range.end()
|
||||
( self.m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
|
||||
|
||||
exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
|
||||
@ -428,7 +431,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -543,7 +546,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_result_ptr( reducer.view().data() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
@ -584,7 +587,7 @@ private:
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -601,7 +604,7 @@ private:
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
#if defined( KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
@ -654,5 +657,6 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
|
||||
|
||||
|
||||
@ -44,24 +44,20 @@
|
||||
#ifndef KOKKOS_HOST_EXP_ITERATE_TILE_HPP
|
||||
#define KOKKOS_HOST_EXP_ITERATE_TILE_HPP
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <stdio.h>
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
|
||||
// Define KOKKOS_MDRANGE_IVDEP only when KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION and
// KOKKOS_HAVE_PRAGMA_IVDEP are both defined and __CUDA_ARCH__ is not defined.
// NOTE(review): other guards in this code base test KOKKOS_ENABLE_PRAGMA_IVDEP, not
// KOKKOS_HAVE_PRAGMA_IVDEP -- confirm the HAVE spelling is still defined somewhere,
// otherwise KOKKOS_MDRANGE_IVDEP is never set and the pragma below is never emitted.
#if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
|
||||
#define KOKKOS_MDRANGE_IVDEP
|
||||
#endif
|
||||
|
|
||||
|
|
||||
// KOKKOS_ENABLE_IVDEP_MDRANGE expands to _Pragma("ivdep") when KOKKOS_MDRANGE_IVDEP is defined
// and to nothing otherwise. The _Pragma operator form (rather than a #pragma line) lets it be
// placed inside the backslash-continued loop macros that use it later in this file.
#ifdef KOKKOS_MDRANGE_IVDEP
|
||||
#define KOKKOS_ENABLE_IVDEP_MDRANGE _Pragma("ivdep")
|
||||
#else
|
||||
#define KOKKOS_ENABLE_IVDEP_MDRANGE
|
||||
#endif
|
||||
|
||||
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cstdio>
|
||||
|
||||
namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
|
||||
@ -243,7 +239,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
APPLY( func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_2( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -255,7 +251,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
LOOP_R_1( func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_3( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -267,7 +263,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
LOOP_R_2( func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_4( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -279,7 +275,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
LOOP_R_3( func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_5( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -291,7 +287,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
LOOP_R_4( func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_6( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -303,7 +299,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
LOOP_R_5( func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_7( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -315,7 +311,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
LOOP_R_6( func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_8( func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -327,7 +323,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
LOOP_R_7( func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TILE_LOOP_1( func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -459,7 +455,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
APPLY_REDUX( val, func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_2_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -471,7 +467,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
LOOP_R_1_REDUX( val, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_3_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -483,7 +479,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
LOOP_R_2_REDUX( val, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_4_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -495,7 +491,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
LOOP_R_3_REDUX( val, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_5_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -507,7 +503,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
LOOP_R_4_REDUX( val, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_6_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -519,7 +515,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
LOOP_R_5_REDUX( val, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_7_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -531,7 +527,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
LOOP_R_6_REDUX( val, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define LOOP_LAYOUT_8_REDUX( val, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -543,7 +539,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
LOOP_R_7_REDUX( val, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TILE_LOOP_1_REDUX( val, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -676,7 +672,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
TAGGED_APPLY( tag, func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_2( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -688,7 +684,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
TAGGED_LOOP_R_1( tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_3( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -700,7 +696,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
TAGGED_LOOP_R_2( tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_4( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -712,7 +708,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
TAGGED_LOOP_R_3( tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_5( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -724,7 +720,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
TAGGED_LOOP_R_4( tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_6( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -736,7 +732,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
TAGGED_LOOP_R_5( tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_7( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -748,7 +744,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
TAGGED_LOOP_R_6( tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_8( tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -760,7 +756,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
TAGGED_LOOP_R_7( tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TAGGED_TILE_LOOP_1( tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -892,7 +888,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
KOKKOS_ENABLE_IVDEP_MDRANGE \
|
||||
for( type i0 = (type)0; i0 < static_cast<type>(extent[0]); ++i0) { \
|
||||
TAGGED_APPLY_REDUX( val, tag, func, i0 + m_offset[0] ) \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_2_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -904,7 +900,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i1 = (type)0; i1 < static_cast<type>(extent[0]); ++i1) { \
|
||||
TAGGED_LOOP_R_1_REDUX( val, tag, func, type, m_offset, extent, 1 , i1 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_3_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -916,7 +912,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i2 = (type)0; i2 < static_cast<type>(extent[0]); ++i2) { \
|
||||
TAGGED_LOOP_R_2_REDUX( val, tag, func, type, m_offset, extent, 1 , i2 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_4_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -928,7 +924,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i3 = (type)0; i3 < static_cast<type>(extent[0]); ++i3) { \
|
||||
TAGGED_LOOP_R_3_REDUX( val, tag, func, type, m_offset, extent, 1 , i3 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_5_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -940,7 +936,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i4 = (type)0; i4 < static_cast<type>(extent[0]); ++i4) { \
|
||||
TAGGED_LOOP_R_4_REDUX( val, tag, func, type, m_offset, extent, 1 , i4 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_6_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -952,7 +948,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i5 = (type)0; i5 < static_cast<type>(extent[0]); ++i5) { \
|
||||
TAGGED_LOOP_R_5_REDUX( val, tag, func, type, m_offset, extent, 1 , i5 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_7_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -964,7 +960,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i6 = (type)0; i6 < static_cast<type>(extent[0]); ++i6) { \
|
||||
TAGGED_LOOP_R_6_REDUX( val, tag, func, type, m_offset, extent, 1 , i6 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
#define TAGGED_LOOP_LAYOUT_8_REDUX( val, tag, func, type, is_left, m_offset, extent, rank ) \
|
||||
if (is_left) { \
|
||||
@ -976,7 +972,7 @@ namespace Kokkos { namespace Experimental { namespace Impl {
|
||||
for( type i7 = (type)0; i7 < static_cast<type>(extent[0]); ++i7) { \
|
||||
TAGGED_LOOP_R_7_REDUX( val, tag, func, type, m_offset, extent, 1 , i7 + m_offset[0] ) \
|
||||
} \
|
||||
}
|
||||
}
|
||||
|
||||
// Partial vs Full Tile
|
||||
#define TAGGED_TILE_LOOP_1_REDUX( val, tag, func, type, is_left, cond, m_offset, extent_full, extent_partial, rank ) \
|
||||
@ -1321,8 +1317,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
else {
|
||||
is_full_tile = false ;
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
|
||||
}
|
||||
}
|
||||
@ -1332,7 +1328,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
|
||||
template <int Rank>
|
||||
struct RankTag
|
||||
struct RankTag
|
||||
{
|
||||
typedef RankTag type;
|
||||
enum { value = (int)Rank };
|
||||
@ -1343,13 +1339,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1361,13 +1357,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#else
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
@ -1384,7 +1380,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1396,7 +1392,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
@ -1437,7 +1433,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1490,7 +1486,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1543,7 +1539,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1596,7 +1592,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1649,7 +1645,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1702,7 +1698,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1804,8 +1800,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
else {
|
||||
is_full_tile = false ;
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
partial_tile[i] = (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1
|
||||
: (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 ? (m_rp.m_upper[i] - offset[i])
|
||||
: (m_rp.m_upper[i] - m_rp.m_lower[i]) ; // when single tile encloses range
|
||||
}
|
||||
}
|
||||
@ -1815,7 +1811,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
|
||||
template <int Rank>
|
||||
struct RankTag
|
||||
struct RankTag
|
||||
{
|
||||
typedef RankTag type;
|
||||
enum { value = (int)Rank };
|
||||
@ -1827,13 +1823,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
inline
|
||||
void
|
||||
operator()(IType tile_idx) const
|
||||
{
|
||||
{
|
||||
point_type m_offset;
|
||||
point_type m_tiledims;
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1845,13 +1841,13 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
Tile_Loop_Type< RP::rank, (RP::inner_direction == RP::Left), index_type, Tag >::apply( m_v, m_func, full_tile, m_offset, m_rp.m_tile, m_tiledims );
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
#else
|
||||
template <typename IType>
|
||||
inline
|
||||
void
|
||||
@ -1869,7 +1865,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1881,7 +1877,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
}
|
||||
|
||||
//Check if offset+tiledim in bounds - if not, replace tile dims with the partial tile dims
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
const bool full_tile = check_iteration_bounds(m_tiledims , m_offset) ;
|
||||
|
||||
if (RP::inner_direction == RP::Left) {
|
||||
if ( full_tile ) {
|
||||
@ -1922,7 +1918,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -1975,7 +1971,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2028,7 +2024,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2081,7 +2077,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2134,7 +2130,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2187,7 +2183,7 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
|
||||
|
||||
if (RP::outer_direction == RP::Left) {
|
||||
for (int i=0; i<RP::rank; ++i) {
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
m_offset[i] = (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i] ;
|
||||
tile_idx /= m_rp.m_tile_end[i];
|
||||
}
|
||||
}
|
||||
@ -2352,5 +2348,5 @@ struct MDFunctor< MDRange, Functor, void >
|
||||
|
||||
} } } //end namespace Kokkos::Experimental::Impl
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -195,3 +195,4 @@ struct PolicyTraits
|
||||
|
||||
|
||||
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,10 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_HPP )
|
||||
#define KOKKOS_ATOMIC_ASSEMBLY_HPP
|
||||
namespace Kokkos {
|
||||
@ -110,3 +112,4 @@ namespace Impl {
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
|
||||
#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_DECREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_DECREMENT_HPP
|
||||
|
||||
@ -117,3 +118,4 @@ void atomic_decrement(volatile T* a) {
|
||||
|
||||
} // End of namespace Kokkos
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
|
||||
#define KOKKOS_ATOMIC_EXCHANGE_HPP
|
||||
|
||||
@ -356,5 +357,3 @@ void atomic_assign( volatile T * const dest , const T val )
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_ADD_HPP
|
||||
|
||||
@ -279,6 +280,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
{
|
||||
while( !Impl::lock_address_host_space( (void*) dest ) );
|
||||
T return_val = *dest;
|
||||
|
||||
// Don't use the following line of code here:
|
||||
//
|
||||
//const T tmp = *dest = return_val + val;
|
||||
@ -292,6 +294,7 @@ T atomic_fetch_add( volatile T * const dest ,
|
||||
const T tmp = *dest;
|
||||
(void) tmp;
|
||||
Impl::unlock_address_host_space( (void*) dest );
|
||||
|
||||
return return_val;
|
||||
}
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,11 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_AND_HPP
|
||||
|
||||
@ -124,4 +125,3 @@ void atomic_and(volatile T * const dest, const T src) {
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,11 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_OR_HPP
|
||||
|
||||
@ -124,4 +125,3 @@ void atomic_or(volatile T * const dest, const T src) {
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,11 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
|
||||
#define KOKKOS_ATOMIC_FETCH_SUB_HPP
|
||||
|
||||
@ -238,4 +239,3 @@ void atomic_sub(volatile T * const dest, const T src) {
|
||||
#include<impl/Kokkos_Atomic_Assembly.hpp>
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,10 +36,12 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
|
||||
#define KOKKOS_ATOMIC_GENERIC_HPP
|
||||
#include <Kokkos_Macros.hpp>
|
||||
@ -424,6 +426,6 @@ T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
|
||||
return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
} // namespace Kokkos
|
||||
#endif
|
||||
|
||||
|
||||
@ -41,6 +41,7 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if defined( KOKKOS_ATOMIC_HPP) && ! defined( KOKKOS_ATOMIC_INCREMENT_HPP )
|
||||
#define KOKKOS_ATOMIC_INCREMENT_HPP
|
||||
|
||||
@ -115,3 +116,4 @@ void atomic_increment(volatile T* a) {
|
||||
|
||||
} // End of namespace Kokkos
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -428,3 +428,4 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,12 +36,13 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
|
||||
#define KOKKOS_ATOMIC_WINDOWS_HPP
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
#define NOMINMAX
|
||||
|
||||
@ -45,12 +45,38 @@
|
||||
#define KOKKOS_BITOPS_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <stdint.h>
|
||||
#include <cstdint>
|
||||
#include <climits>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/**\brief Find first zero bit.
|
||||
*
|
||||
* If none then return -1 ;
|
||||
*/
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
int bit_first_zero( unsigned i ) noexcept
|
||||
{
|
||||
enum : unsigned { full = ~0u };
|
||||
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return full != i ? __ffs( ~i ) - 1 : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
return full != i ? _bit_scan_forward( ~i ) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return full != i ? __cnttz4( ~i ) : -1 ;
|
||||
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
|
||||
#else
|
||||
int offset = -1 ;
|
||||
if ( full != i ) {
|
||||
for ( offset = 0 ; i & ( 1 << offset ) ; ++offset );
|
||||
}
|
||||
return offset ;
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
int bit_scan_forward( unsigned i )
|
||||
{
|
||||
@ -121,7 +147,16 @@ int bit_count( unsigned i )
|
||||
#endif
|
||||
}
|
||||
|
||||
/**\brief  Exponent of a power of two that is not less than \c N.
 *
 *  \param N  Value to be contained.
 *  \return   The result of bit_scan_reverse( N ) when two raised to that
 *            value already reaches \c N, otherwise that value plus one.
 *
 *  NOTE(review): the return value is the exponent, not the power itself;
 *  presumably bit_scan_reverse yields the index of the highest set bit,
 *  and its result for N == 0 is not visible here -- confirm.
 */
KOKKOS_INLINE_FUNCTION
unsigned integral_power_of_two_that_contains( const unsigned N )
{
  const unsigned top_bit = Kokkos::Impl::bit_scan_reverse( N );

  // 2^top_bit already covers N: no rounding up needed.
  if ( N <= ( 1u << top_bit ) ) {
    return top_bit ;
  }

  return top_bit + 1 ;
}
|
||||
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_BITOPS_HPP
|
||||
|
||||
|
||||
@ -122,3 +122,4 @@ int mpi_local_rank_on_node() {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
106
lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
Normal file
106
lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp
Normal file
@ -0,0 +1,106 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_CLOCKTIC_HPP
|
||||
#define KOKKOS_CLOCKTIC_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <stdint.h>
|
||||
#include <chrono>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/**\brief Quick query of clock register tics
|
||||
*
|
||||
* Primary use case is to, with low overhead,
|
||||
* obtain a integral value that consistently varies
|
||||
* across concurrent threads of execution within
|
||||
* a parallel algorithm.
|
||||
* This value is often used to "randomly" seed an
|
||||
* attempt to acquire an indexed resource (e.g., bit)
|
||||
* from an array of resources (e.g., bitset) such that
|
||||
* concurrent threads will have high likelihood of
|
||||
* having different index-seed values.
|
||||
*/
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
uint64_t clock_tic(void) noexcept
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
|
||||
// Return value of 64-bit hi-res clock register.
|
||||
|
||||
return clock64();
|
||||
|
||||
#elif defined( __i386__ ) || defined( __x86_64 )
|
||||
|
||||
// Return value of 64-bit hi-res clock register.
|
||||
|
||||
unsigned a = 0, d = 0;
|
||||
|
||||
__asm__ volatile( "rdtsc" : "=a" (a), "=d" (d) );
|
||||
|
||||
return ( (uint64_t) a ) | ( ( (uint64_t) d ) << 32 );
|
||||
|
||||
#elif defined( __powerpc ) || defined( __powerpc__ ) || \
|
||||
defined( __powerpc64__ ) || defined( __POWERPC__ ) || \
|
||||
defined( __ppc__ ) || defined( __ppc64__ )
|
||||
|
||||
unsigned int cycles = 0;
|
||||
|
||||
asm volatile( "mftb %0" : "=r" (cycles) );
|
||||
|
||||
return (uint64_t) cycles;
|
||||
|
||||
#else
|
||||
|
||||
return (uint64_t)
|
||||
std::chrono::high_resolution_clock::now().time_since_epoch().count();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_CLOCKTIC_HPP
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user