Update Kokkos to v2.04.11

Stan Moore
2017-11-06 13:47:33 -07:00
parent 39df9f5d94
commit 16b5315845
117 changed files with 33239 additions and 1093 deletions

View File

@ -1,4 +1,34 @@
# Change Log
## [2.04.11](https://github.com/kokkos/kokkos/tree/2.04.11) (2017-10-28)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.04...2.04.11)
**Implemented enhancements:**
- Add Subview pattern. [\#648](https://github.com/kokkos/kokkos/issues/648)
- Add Kokkos "global" is\_initialized [\#1060](https://github.com/kokkos/kokkos/issues/1060)
- Add create\_mirror\_view\_and\_copy [\#1161](https://github.com/kokkos/kokkos/issues/1161)
- Add KokkosConcepts SpaceAccessibility function [\#1092](https://github.com/kokkos/kokkos/issues/1092)
- Option to Disable Initialize Warnings [\#1142](https://github.com/kokkos/kokkos/issues/1142)
- Mature task-DAG capability [\#320](https://github.com/kokkos/kokkos/issues/320)
- Promote Work DAG from experimental [\#1126](https://github.com/kokkos/kokkos/issues/1126)
- Implement new WorkGraph push/pop [\#1108](https://github.com/kokkos/kokkos/issues/1108)
- Kokkos\_ENABLE\_Cuda\_Lambda should default ON [\#1101](https://github.com/kokkos/kokkos/issues/1101)
- Add multidimensional parallel for example and improve unit test [\#1064](https://github.com/kokkos/kokkos/issues/1064)
- Fix ROCm: Performance tests not building [\#1038](https://github.com/kokkos/kokkos/issues/1038)
- Make KOKKOS\_ALIGN\_SIZE a configure-time option [\#1004](https://github.com/kokkos/kokkos/issues/1004)
- Make alignment consistent [\#809](https://github.com/kokkos/kokkos/issues/809)
- Improve subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
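Two of the enhancements above are small user-facing API additions. A minimal hedged sketch of typical use (names per issues #1060 and #1161; assumes any enabled backend):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  if (Kokkos::is_initialized()) {          // "global" is_initialized, issue #1060
    Kokkos::View<double*> d("d", 10);      // lives in the default memory space
    // Allocates a HostSpace mirror only if needed, then deep-copies d into it
    // (issue #1161).
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d);
    h(0) = 1.0;                            // host-accessible
  }
  Kokkos::finalize();
  return 0;
}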
**Fixed bugs:**
- Kokkos::vector fixes for application [\#1134](https://github.com/kokkos/kokkos/issues/1134)
- DynamicView non-power of two value\_type [\#1177](https://github.com/kokkos/kokkos/issues/1177)
- Memory pool bug [\#1154](https://github.com/kokkos/kokkos/issues/1154)
- Cuda launch bounds performance regression bug [\#1140](https://github.com/kokkos/kokkos/issues/1140)
- Significant performance regression in LAMMPS after updating Kokkos [\#1139](https://github.com/kokkos/kokkos/issues/1139)
- CUDA compile error [\#1128](https://github.com/kokkos/kokkos/issues/1128)
- MDRangePolicy neg idx test failure in debug mode [\#1113](https://github.com/kokkos/kokkos/issues/1113)
- subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)

View File

@ -33,6 +33,24 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead,
# for compatibility with Kokkos' Makefile build system.
if (TPL_ENABLE_CUDA)
if (DEFINED CUDA_VERSION)
# there is a VERSION_GREATER_EQUAL, but only in CMake >= 3.7
if (CUDA_VERSION VERSION_EQUAL "7.5")
set(KOKKOS_HAVE_CUDA_GEQ_75 TRUE)
endif()
if (CUDA_VERSION VERSION_GREATER "7.5")
set(KOKKOS_HAVE_CUDA_GEQ_75 TRUE)
endif()
if (CUDA_VERSION VERSION_EQUAL "8.0")
set(KOKKOS_HAVE_CUDA_GEQ_80 TRUE)
endif()
if (CUDA_VERSION VERSION_GREATER "8.0")
set(KOKKOS_HAVE_CUDA_GEQ_80 TRUE)
endif()
endif()
endif()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_DEBUG
KOKKOS_HAVE_DEBUG
@ -51,7 +69,7 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda
KOKKOS_HAVE_CUDA
"Enable CUDA support in Kokkos."
"${TPL_ENABLE_CUDA}"
"${KOKKOS_HAVE_CUDA_TPL}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
@ -68,11 +86,27 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
OFF
)
set(Kokkos_ENABLE_Cuda_Lambda_DEFAULT OFF)
if (Kokkos_ENABLE_Cuda)
if (KOKKOS_HAVE_CUDA_GEQ_75)
if (CMAKE_CXX_FLAGS MATCHES "-expt-extended-lambda")
set(Kokkos_ENABLE_Cuda_Lambda_DEFAULT ON)
message("-- CUDA version is >= 7.5 and CMAKE_CXX_FLAGS contains -expt-extended-lambda,")
message("-- Kokkos_ENABLE_Cuda_Lambda defaults to ON")
else()
message("-- CMAKE_CXX_FLAGS doesn't contain -expt-extended-lambda,")
message("-- Kokkos_ENABLE_Cuda_Lambda defaults to OFF")
endif()
else()
message("-- CUDA version is < 7.5, Kokkos_ENABLE_Cuda_Lambda defaults to OFF")
endif()
endif()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda_Lambda
KOKKOS_HAVE_CUDA_LAMBDA
"Enable CUDA LAMBDA support in Kokkos."
OFF
"${Kokkos_ENABLE_Cuda_Lambda_DEFAULT}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(

View File

@ -211,12 +211,15 @@ void test_dynamic_view_sort(unsigned int n )
const size_t upper_bound = 2 * n ;
const size_t total_alloc_size = n * sizeof(KeyType) * 1.2 ;
const size_t superblock_size = std::min(total_alloc_size, size_t(1000000));
typename KeyDynamicViewType::memory_pool
pool( memory_space()
, n * sizeof(KeyType) * 1.2
, 500 /* min block size in bytes */
, 30000 /* max block size in bytes */
, 1000000 /* min superblock size in bytes */
, superblock_size
);
KeyDynamicViewType keys("Keys",pool,upper_bound);
@ -271,8 +274,10 @@ void test_sort(unsigned int N)
{
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
#if !defined(KOKKOS_ENABLE_ROCM)
test_3D_sort<ExecutionSpace,KeyType>(N);
test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
#endif
}
}

View File

@ -0,0 +1,44 @@
KOKKOS_PATH = ${HOME}/kokkos
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
EXE_NAME = "test"
SRC = $(wildcard *.cpp)
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/config/nvcc_wrapper
EXE = ${EXE_NAME}.cuda
KOKKOS_CUDA_OPTIONS = "enable_lambda"
else
CXX = g++
EXE = ${EXE_NAME}.host
endif
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS = -O3
DEPFLAGS = -M
OBJ = $(SRC:.cpp=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,124 @@
#include<Kokkos_Core.hpp>
#include<impl/Kokkos_Timer.hpp>
#include<Kokkos_Random.hpp>
template<class Scalar>
double test_atomic(int L, int N, int M,int K,int R,Kokkos::View<const int**> offsets) {
Kokkos::View<Scalar*> output("Output",N);
Kokkos::Impl::Timer timer;
for(int r = 0; r<R; r++)
Kokkos::parallel_for(L, KOKKOS_LAMBDA (const int&i) {
Scalar s = 2;
for(int m=0;m<M;m++) {
for(int k=0;k<K;k++)
s=s*s+s;
const int idx = (i+offsets(i,m))%N;
Kokkos::atomic_add(&output(idx),s);
}
});
Kokkos::fence();
double time = timer.seconds();
return time;
}
template<class Scalar>
double test_no_atomic(int L, int N, int M,int K,int R,Kokkos::View<const int**> offsets) {
Kokkos::View<Scalar*> output("Output",N);
Kokkos::Impl::Timer timer;
for(int r = 0; r<R; r++)
Kokkos::parallel_for(L, KOKKOS_LAMBDA (const int&i) {
Scalar s = 2;
for(int m=0;m<M;m++) {
for(int k=0;k<K;k++)
s=s*s+s;
const int idx = (i+offsets(i,m))%N;
output(idx) += s;
}
});
Kokkos::fence();
double time = timer.seconds();
return time;
}
int main(int argc, char* argv[]) {
Kokkos::initialize(argc,argv);
{
if(argc<8) {
printf("Arguments: L N M D K R T\n");
printf(" L: Number of iterations to run\n");
printf(" N: Length of array to do atomics into\n");
printf(" M: Number of atomics per iteration to do\n");
printf(" D: Distance from index i to do atomics into (randomly)\n");
printf(" K: Number of FMAD per atomic\n");
printf(" R: Number of repeats of the experiments\n");
printf(" T: Type of atomic\n");
printf(" 1 - int\n");
printf(" 2 - long\n");
printf(" 3 - float\n");
printf(" 4 - double\n");
printf(" 5 - complex<double>\n");
printf("Example Input GPU:\n");
printf(" Histogram : 1000000 1000 1 1000 1 10 1\n");
printf(" MD Force : 100000 100000 100 1000 20 10 4\n");
printf(" Matrix Assembly : 100000 1000000 50 1000 20 10 4\n");
Kokkos::finalize();
return 0;
}
int L = atoi(argv[1]);
int N = atoi(argv[2]);
int M = atoi(argv[3]);
int D = atoi(argv[4]);
int K = atoi(argv[5]);
int R = atoi(argv[6]);
int type = atoi(argv[7]);
Kokkos::View<int**> offsets("Offsets",L,M);
Kokkos::Random_XorShift64_Pool<> pool(12371);
Kokkos::fill_random(offsets,pool,D);
double time = 0;
if(type==1)
time = test_atomic<int>(L,N,M,K,R,offsets);
if(type==2)
time = test_atomic<long>(L,N,M,K,R,offsets);
if(type==3)
time = test_atomic<float>(L,N,M,K,R,offsets);
if(type==4)
time = test_atomic<double>(L,N,M,K,R,offsets);
if(type==5)
time = test_atomic<Kokkos::complex<double> >(L,N,M,K,R,offsets);
double time2 = 1;
if(type==1)
time2 = test_no_atomic<int>(L,N,M,K,R,offsets);
if(type==2)
time2 = test_no_atomic<long>(L,N,M,K,R,offsets);
if(type==3)
time2 = test_no_atomic<float>(L,N,M,K,R,offsets);
if(type==4)
time2 = test_no_atomic<double>(L,N,M,K,R,offsets);
if(type==5)
time2 = test_no_atomic<Kokkos::complex<double> >(L,N,M,K,R,offsets);
int size = 0;
if(type==1) size = sizeof(int);
if(type==2) size = sizeof(long);
if(type==3) size = sizeof(float);
if(type==4) size = sizeof(double);
if(type==5) size = sizeof(Kokkos::complex<double>);
printf("%i\n",size);
printf("Time: %s %i %i %i %i %i %i (t_atomic: %e t_nonatomic: %e ratio: %lf )( GUpdates/s: %lf GB/s: %lf )\n",
(type==1)?"int": (
(type==2)?"long": (
(type==3)?"float": (
(type==4)?"double":"complex"))),
L,N,M,D,K,R,time,time2,time/time2,
1.e-9*L*R*M/time, 1.0*L*R*M*2*size/time/1024/1024/1024);
}
Kokkos::finalize();
}
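The benchmark above contrasts Kokkos::atomic_add against plain, racy updates into a shared array. A minimal hedged sketch of that contrast, checking the atomic result on the host via create_mirror_view_and_copy from this release (assumes any enabled backend):

#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int L = 100000, N = 8;              // many updates into few slots
    Kokkos::View<long*> hist("hist", N);
    Kokkos::parallel_for(L, KOKKOS_LAMBDA(const int i) {
      Kokkos::atomic_add(&hist(i % N), 1L);   // contended but exact
      // hist(i % N) += 1;                    // the racy variant the benchmark times
    });
    Kokkos::fence();
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), hist);
    printf("hist(0) = %ld (expect %d)\n", h(0), L / N);
  }
  Kokkos::finalize();
  return 0;
}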

View File

@ -303,9 +303,9 @@ if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
HPCBIND_ERR=/dev/null
HPCBIND_OUT=/dev/null
else
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_QUEUE_INDEX}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_QUEUE_INDEX}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_QUEUE_INDEX}"
> ${HPCBIND_LOG}
fi

View File

@ -10,3 +10,4 @@ tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1
tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a
tag: 2.04.11 date: 10:28:2017 master: 54a1330a develop: ed36c017

View File

@ -1,6 +1,6 @@
#!/bin/bash -el
ulimit -c 0
module load devpack/openmpi/1.10.0/intel/16.1.056/cuda/none
module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none
KOKKOS_BRANCH=$1
TRILINOS_UPDATE_BRANCH=$2

View File

@ -1,6 +1,6 @@
#!/bin/bash -el
ulimit -c 0
module load devpack/openmpi/1.10.0/intel/16.1.056/cuda/none
module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none
KOKKOS_BRANCH=$1
TRILINOS_UPDATE_BRANCH=$2

View File

@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o TestMain.o gtest-all.o
TARGETS += KokkosContainers_PerformanceTest_ROCm
TEST_TARGETS += test-rocm
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o
TARGETS += KokkosContainers_PerformanceTest_Threads
@ -45,6 +51,9 @@ endif
KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
KokkosContainers_PerformanceTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_ROCm
KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads
@ -54,6 +63,9 @@ KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosContainers_PerformanceTest_Cuda
./KokkosContainers_PerformanceTest_Cuda
test-rocm: KokkosContainers_PerformanceTest_ROCm
./KokkosContainers_PerformanceTest_ROCm
test-threads: KokkosContainers_PerformanceTest_Threads
./KokkosContainers_PerformanceTest_Threads

View File

@ -0,0 +1,113 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <cstdint>
#include <string>
#include <iostream>
#include <iomanip>
#include <sstream>
#include <fstream>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <TestDynRankView.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>
namespace Performance {
class rocm : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
}
static void TearDownTestCase()
{
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
#if 0
// issue 1089
TEST_F( rocm, dynrankview_perf )
{
std::cout << "ROCm" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Experimental::ROCm>( 40960 );
}
TEST_F( rocm, global_2_local)
{
std::cout << "ROCm" << std::endl;
std::cout << "size, create, generate, fill, find" << std::endl;
for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
test_global_to_local_ids<Kokkos::Experimental::ROCm>(i);
}
#endif
TEST_F( rocm, unordered_map_performance_near)
{
Perf::run_performance_tests<Kokkos::Experimental::ROCm,true>("rocm-near");
}
TEST_F( rocm, unordered_map_performance_far)
{
Perf::run_performance_tests<Kokkos::Experimental::ROCm,false>("rocm-far");
}
}
#else
void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {}
#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */

View File

@ -72,8 +72,10 @@ private:
, "DynamicView must be rank-one" );
static_assert( std::is_trivial< typename traits::value_type >::value &&
std::is_same< typename traits::specialize , void >::value
, "DynamicView must have trivial data type" );
std::is_same< typename traits::specialize , void >::value &&
Kokkos::Impl::is_power_of_two
<sizeof(typename traits::value_type)>::value
, "DynamicView must have trivial value_type and sizeof(value_type) is a power-of-two");
template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space
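The strengthened static_assert above additionally requires sizeof(value_type) to be a power of two, which the pool-backed DynamicView allocation math relies on (issue #1177). A minimal sketch, not Kokkos' actual Impl::is_power_of_two, of such a compile-time check:

#include <cstddef>

template <std::size_t N>
struct is_power_of_two {
  // N > 0 is a power of two exactly when it has a single set bit,
  // i.e. N & (N - 1) == 0.
  static constexpr bool value = (N > 0) && ((N & (N - 1)) == 0);
};

static_assert(is_power_of_two<sizeof(double)>::value, "8 is accepted");
static_assert(!is_power_of_two<12>::value, "12 is rejected");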

View File

@ -517,7 +517,7 @@ public:
size_type find_attempts = 0;
enum { bounded_find_attempts = 32u };
enum : unsigned { bounded_find_attempts = 32u };
const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ?
bounded_find_attempts :
m_available_indexes.max_hint();

View File

@ -59,8 +59,8 @@ class vector : public DualView<Scalar*,LayoutLeft,Arg1Type> {
typedef Scalar value_type;
typedef Scalar* pointer;
typedef const Scalar* const_pointer;
typedef Scalar* reference;
typedef const Scalar* const_reference;
typedef Scalar& reference;
typedef const Scalar& const_reference;
typedef Scalar* iterator;
typedef const Scalar* const_iterator;
@ -73,11 +73,11 @@ private:
public:
#ifdef KOKKOS_ENABLE_CUDA_UVM
KOKKOS_INLINE_FUNCTION Scalar& operator() (int i) const {return DV::h_view(i);};
KOKKOS_INLINE_FUNCTION Scalar& operator[] (int i) const {return DV::h_view(i);};
KOKKOS_INLINE_FUNCTION reference operator() (int i) const {return DV::h_view(i);};
KOKKOS_INLINE_FUNCTION reference operator[] (int i) const {return DV::h_view(i);};
#else
inline Scalar& operator() (int i) const {return DV::h_view(i);};
inline Scalar& operator[] (int i) const {return DV::h_view(i);};
inline reference operator() (int i) const {return DV::h_view(i);};
inline reference operator[] (int i) const {return DV::h_view(i);};
#endif
/* Member functions which behave like std::vector functions */
@ -86,7 +86,7 @@ public:
_size = 0;
_extra_storage = 1.1;
DV::modified_host() = 1;
};
}
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
@ -146,25 +146,32 @@ public:
DV::h_view(_size) = val;
_size++;
};
}
void pop_back() {
_size--;
};
}
void clear() {
_size = 0;
}
size_type size() const {return _size;};
size_type size() const {return _size;}
size_type max_size() const {return 2000000000;}
size_type capacity() const {return DV::capacity();};
bool empty() const {return _size==0;};
size_type capacity() const {return DV::capacity();}
bool empty() const {return _size==0;}
iterator begin() const {return &DV::h_view(0);};
iterator begin() const {return &DV::h_view(0);}
iterator end() const {return &DV::h_view(_size);};
iterator end() const {return &DV::h_view(_size);}
reference front() {return DV::h_view(0);}
reference back() {return DV::h_view(_size - 1);}
const_reference front() const {return DV::h_view(0);}
const_reference back() const {return DV::h_view(_size - 1);}
/* std::algorithms which originally work with iterators; here they are implemented as member functions */
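A hedged usage sketch of the std::vector-like interface touched above, exercising the new front()/back() accessors and the now reference-typed operator[] (assumes a host-accessible default build):

#include <Kokkos_Core.hpp>
#include <Kokkos_Vector.hpp>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::vector<double> v;     // DualView<double*> underneath
    v.push_back(1.0);
    v.push_back(2.0);
    v.push_back(3.0);
    v[1] = 4.0;                   // reference, not pointer, after this commit
    printf("front=%g back=%g size=%d\n", v.front(), v.back(), (int)v.size());
    v.pop_back();                 // size() drops to 2; no deallocation
  }
  Kokkos::finalize();
  return 0;
}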

View File

@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
TEST_TARGETS += test-cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
TARGETS += KokkosContainers_UnitTest_ROCm
TEST_TARGETS += test-rocm
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
TARGETS += KokkosContainers_UnitTest_Threads
@ -51,6 +57,9 @@ endif
KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Cuda
KokkosContainers_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_ROCm
KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Threads
@ -63,6 +72,9 @@ KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
test-cuda: KokkosContainers_UnitTest_Cuda
./KokkosContainers_UnitTest_Cuda
test-rocm: KokkosContainers_UnitTest_ROCm
./KokkosContainers_UnitTest_ROCm
test-threads: KokkosContainers_UnitTest_Threads
./KokkosContainers_UnitTest_Threads

View File

@ -131,11 +131,14 @@ struct TestDynamicView
// printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
const size_t total_alloc_size = arg_total_size * sizeof(Scalar) * 1.2 ;
const size_t superblock = std::min( total_alloc_size , size_t(1000000) );
memory_pool_type pool( memory_space()
, arg_total_size * sizeof(Scalar) * 1.2
, total_alloc_size
, 500 /* min block size in bytes */
, 30000 /* max block size in bytes */
, 1000000 /* min superblock size in bytes */
, superblock
);
// printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);

View File

@ -0,0 +1,263 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#ifdef KOKKOS_ENABLE_ROCM
#include <iostream>
#include <iomanip>
#include <cstdint>
#include <gtest/gtest.h>
#include <Kokkos_Core.hpp>
#include <Kokkos_Bitset.hpp>
#include <Kokkos_UnorderedMap.hpp>
#include <Kokkos_Vector.hpp>
#include <TestBitset.hpp>
#include <TestUnorderedMap.hpp>
#include <TestStaticCrsGraph.hpp>
#include <TestVector.hpp>
#include <TestDualView.hpp>
#include <TestDynamicView.hpp>
#include <Kokkos_DynRankView.hpp>
#include <TestDynViewAPI.hpp>
#include <Kokkos_ErrorReporter.hpp>
#include <TestErrorReporter.hpp>
#include <TestViewCtorPropEmbeddedDim.hpp>
//----------------------------------------------------------------------------
namespace Test {
class rocm : public ::testing::Test {
protected:
static void SetUpTestCase()
{
std::cout << std::setprecision(5) << std::scientific;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
}
static void TearDownTestCase()
{
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
};
#if !defined(KOKKOS_ENABLE_ROCM)
//issue 964
TEST_F( rocm , dyn_view_api) {
TestDynViewAPI< double , Kokkos::Experimental::ROCm >();
}
#endif
TEST_F( rocm, viewctorprop_embedded_dim ) {
TestViewCtorProp_EmbeddedDim< Kokkos::Experimental::ROCm >::test_vcpt( 2, 3 );
}
TEST_F( rocm , staticcrsgraph )
{
TestStaticCrsGraph::run_test_graph< Kokkos::Experimental::ROCm >();
TestStaticCrsGraph::run_test_graph2< Kokkos::Experimental::ROCm >();
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 0);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 1000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 10000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 100000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 0);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 1000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 10000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 100000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 0);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 1000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 10000);
TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 100000);
}
#if !defined(KOKKOS_ENABLE_ROCM)
// issue 1089
// same as 130203 (MemPool, static member function link issue)
void rocm_test_insert_close( uint32_t num_nodes
, uint32_t num_inserts
, uint32_t num_duplicates
)
{
test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, true);
}
// hcc link error , Referencing function in another module!
void rocm_test_insert_far( uint32_t num_nodes
, uint32_t num_inserts
, uint32_t num_duplicates
)
{
test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, false);
}
void rocm_test_failed_insert( uint32_t num_nodes )
{
test_failed_insert< Kokkos::Experimental::ROCm >( num_nodes );
}
void rocm_test_deep_copy( uint32_t num_nodes )
{
test_deep_copy< Kokkos::Experimental::ROCm >( num_nodes );
}
void rocm_test_vector_combinations(unsigned int size)
{
test_vector_combinations<int,Kokkos::Experimental::ROCm>(size);
}
void rocm_test_dualview_combinations(unsigned int size)
{
test_dualview_combinations<int,Kokkos::Experimental::ROCm>(size);
}
void rocm_test_bitset()
{
test_bitset<Kokkos::Experimental::ROCm>();
}
/*TEST_F( rocm, bitset )
{
rocm_test_bitset();
}*/
#define ROCM_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat ) \
TEST_F( rocm, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_insert_##name(num_nodes,num_inserts,num_duplicates); \
}
#define ROCM_FAILED_INSERT_TEST( num_nodes, repeat ) \
TEST_F( rocm, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_failed_insert(num_nodes); \
}
#define ROCM_ASSIGNEMENT_TEST( num_nodes, repeat ) \
TEST_F( rocm, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_assignment_operators(num_nodes); \
}
#define ROCM_DEEP_COPY( num_nodes, repeat ) \
TEST_F( rocm, UnorderedMap_deep_copy##num_nodes##_##repeat##x) { \
for (int i=0; i<repeat; ++i) \
rocm_test_deep_copy(num_nodes); \
}
#define ROCM_VECTOR_COMBINE_TEST( size ) \
TEST_F( rocm, vector_combination##size##x) { \
rocm_test_vector_combinations(size); \
}
#define ROCM_DUALVIEW_COMBINE_TEST( size ) \
TEST_F( rocm, dualview_combination##size##x) { \
rocm_test_dualview_combinations(size); \
}
//ROCM_DUALVIEW_COMBINE_TEST( 10 )
//ROCM_VECTOR_COMBINE_TEST( 10 )
//ROCM_VECTOR_COMBINE_TEST( 3057 )
//ROCM_INSERT_TEST(close, 100000, 90000, 100, 500)
//ROCM_INSERT_TEST(far, 100000, 90000, 100, 500)
//ROCM_DEEP_COPY( 10000, 1 )
//ROCM_FAILED_INSERT_TEST( 10000, 1000 )
#undef ROCM_INSERT_TEST
#undef ROCM_FAILED_INSERT_TEST
#undef ROCM_ASSIGNEMENT_TEST
#undef ROCM_DEEP_COPY
#undef ROCM_VECTOR_COMBINE_TEST
#undef ROCM_DUALVIEW_COMBINE_TEST
#endif
#if !defined(KOKKOS_ENABLE_ROCM)
//static member function issue
TEST_F( rocm , dynamic_view )
{
// typedef TestDynamicView< double , Kokkos::ROCmUVMSpace >
typedef TestDynamicView< double , Kokkos::Experimental::ROCmSpace >
TestDynView ;
for ( int i = 0 ; i < 10 ; ++i ) {
TestDynView::run( 100000 + 100 * i );
}
}
#endif
#if defined(KOKKOS_CLASS_LAMBDA)
TEST_F(rocm, ErrorReporterViaLambda)
{
TestErrorReporter<ErrorReporterDriverUseLambda<Kokkos::Experimental::ROCm>>();
}
#endif
TEST_F(rocm, ErrorReporter)
{
TestErrorReporter<ErrorReporterDriver<Kokkos::Experimental::ROCm>>();
}
}
#else
void KOKKOS_CONTAINERS_UNIT_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {}
#endif /* #ifdef KOKKOS_ENABLE_ROCM */

View File

@ -83,6 +83,16 @@ void run_test_graph()
ASSERT_EQ( (int) hx.entries( j + begin ) , graph[i][j] );
}
}
// Test row view access
for ( size_t i = 0 ; i < LENGTH ; ++i ) {
auto rowView = hx.rowConst(i);
ASSERT_EQ( rowView.length, graph[i].size() );
for ( size_t j = 0 ; j < rowView.length ; ++j ) {
ASSERT_EQ( rowView.colidx( j ) , graph[i][j] );
ASSERT_EQ( rowView( j ) , graph[i][j] );
}
}
}
template< class Space >
@ -182,5 +192,6 @@ void run_test_graph3(size_t B, size_t N)
ASSERT_FALSE((ne>2*((hx.row_map(hx.numRows())+C*hx.numRows())/B))&&(hx.row_block_offsets(i+1)>hx.row_block_offsets(i)+1));
}
}
} /* namespace TestStaticCrsGraph */
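A hedged sketch of the row-view access pattern the new test block checks: rowConst(i) returns a lightweight row view exposing length and colidx(j), as used above (graph construction via create_staticcrsgraph is assumed, matching these tests):

#include <Kokkos_Core.hpp>
#include <Kokkos_StaticCrsGraph.hpp>
#include <vector>
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    std::vector<std::vector<int>> graph = {{0, 1}, {1, 2}, {2}};
    auto g = Kokkos::create_staticcrsgraph<
        Kokkos::StaticCrsGraph<int, Kokkos::HostSpace>>("g", graph);
    for (int i = 0; i < (int)g.numRows(); ++i) {
      auto row = g.rowConst(i);             // row view: length, colidx(j)
      for (int j = 0; j < (int)row.length; ++j)
        printf("row %d -> %d\n", i, (int)row.colidx(j));
    }
  }
  Kokkos::finalize();
  return 0;
}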

View File

@ -43,6 +43,7 @@ TEST_TARGETS += test-atomic
#
ifneq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_MEMPOOL = test_mempool.o
TARGETS += KokkosCore_PerformanceTest_Mempool
TEST_TARGETS += test-mempool
@ -52,6 +53,7 @@ TEST_TARGETS += test-mempool
OBJ_TASKDAG = test_taskdag.o
TARGETS += KokkosCore_PerformanceTest_TaskDAG
TEST_TARGETS += test-taskdag
endif
#

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// ------------------------------------------------------------------ //
@ -110,21 +110,12 @@ struct apply_impl<2,RP,Functor,void >
{
// LL
if (RP::inner_direction == RP::Left) {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1);
}
@ -134,21 +125,12 @@ struct apply_impl<2,RP,Functor,void >
}
// LR
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(offset_0 , offset_1);
}
@ -182,21 +164,12 @@ struct apply_impl<2,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
// Loop over size maxnumblocks until full range covered
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -205,21 +178,12 @@ struct apply_impl<2,RP,Functor,Tag>
}
}
else {
/*
index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y;
index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x;
for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) {
for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) {
m_func(Tag(), i, j);
} }
*/
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
m_func(Tag(), offset_0 , offset_1);
}
@ -255,15 +219,15 @@ struct apply_impl<3,RP,Functor,void >
// LL
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -276,15 +240,15 @@ struct apply_impl<3,RP,Functor,void >
// LR
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(offset_0 , offset_1 , offset_2);
}
@ -319,15 +283,15 @@ struct apply_impl<3,RP,Functor,Tag>
{
if (RP::inner_direction == RP::Left) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -339,15 +303,15 @@ struct apply_impl<3,RP,Functor,Tag>
}
else {
for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x;
const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) {
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2);
}
@ -398,19 +362,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -436,19 +400,19 @@ struct apply_impl<4,RP,Functor,void >
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3);
}
@ -498,19 +462,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0];
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
}
@ -535,19 +499,19 @@ struct apply_impl<4,RP,Functor,Tag>
const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y;
const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) {
for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z;
const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
}
@ -612,23 +576,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -667,23 +631,23 @@ struct apply_impl<5,RP,Functor,void >
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -747,23 +711,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2];
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
}
@ -802,23 +766,23 @@ struct apply_impl<5,RP,Functor,Tag>
const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z;
const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
}
@ -895,27 +859,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -967,27 +931,27 @@ struct apply_impl<6,RP,Functor,void >
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1064,27 +1028,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4];
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
}
@ -1136,27 +1100,27 @@ struct apply_impl<6,RP,Functor,Tag>
const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5];
for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
}
@ -1292,7 +1256,7 @@ protected:
const Functor m_func;
};
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif
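
Taken together, the hunks above fold the policy's lower bound into every computed offset, so a tile iterates the half-open box [lower, upper) instead of implicitly starting at zero. A minimal sketch of the case this fixes, assuming a CUDA-enabled build with lambda support and hypothetical bounds:

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        // Iterate i in [2,10) and j in [3,8); with the change above the Cuda
        // backend now adds m_lower[*] when forming each offset.
        Kokkos::MDRangePolicy<Kokkos::Rank<2>> policy({2, 3}, {10, 8});
        Kokkos::parallel_for("shifted_tile", policy,
          KOKKOS_LAMBDA(const int i, const int j) {
            (void)i; (void)j;  // kernel body sees the shifted index pair
          });
        Kokkos::fence();
      }
      Kokkos::finalize();
      return 0;
    }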

View File

@ -63,7 +63,7 @@
#include <typeinfo>
#endif
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
namespace Refactor {
@ -2709,7 +2709,7 @@ private:
// ----------------------------------------------------------------------------------
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif
#endif

View File

@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_constant_memory()
{
const DriverType & driver =
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
__global__
//__launch_bounds__(maxTperB, minBperSM)
__launch_bounds__(maxTperB, minBperSM)
static void cuda_parallel_launch_local_memory( const DriverType driver )
{
driver();
@ -193,9 +193,14 @@ template < class DriverType
, bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
struct CudaParallelLaunch ;
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -216,21 +221,28 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>();
cuda_parallel_launch_constant_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
@ -240,9 +252,11 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > {
}
};
template < class DriverType, class LaunchBounds >
struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, true >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
@ -252,20 +266,136 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, false > {
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
// Fence before changing settings and copying closure
Kokkos::Cuda::fence();
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads
else if ( shmem ) {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) );
} else {
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) );
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_constant_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
// Copy functor to constant memory on the device
cudaMemcpyToSymbol(
kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) );
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_constant_memory< DriverType >
<<< grid , block , shmem , stream >>>();
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType
, unsigned int MaxThreadsPerBlock
, unsigned int MinBlocksPerSM >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds< MaxThreadsPerBlock
, MinBlocksPerSM >
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver );
// Invoke the driver function on the device
cuda_parallel_launch_local_memory
< DriverType, MaxThreadsPerBlock, MinBlocksPerSM >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
Kokkos::Cuda::fence();
#endif
}
}
};
template < class DriverType >
struct CudaParallelLaunch< DriverType
, Kokkos::LaunchBounds<>
, false >
{
inline
CudaParallelLaunch( const DriverType & driver
, const dim3 & grid
, const dim3 & block
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
}
if ( CudaTraits::SharedMemoryCapacity < shmem ) {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
#ifndef KOKKOS_ARCH_KEPLER
// On Kepler the L1 has no benefit since it doesn't cache reads
else {
CUDA_SAFE_CALL(
cudaFuncSetCacheConfig
( cuda_parallel_launch_local_memory< DriverType >
, ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 )
) );
}
#endif
KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE();
// Invoke the driver function on the device
cuda_parallel_launch_local_memory< DriverType >
<<< grid , block , shmem , stream >>>( driver );
#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
CUDA_SAFE_CALL( cudaGetLastError() );
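
A sketch of how the reworked specializations get selected, assuming a hypothetical functor and size; LaunchBounds<0,0>, spelled LaunchBounds<>, is the new "no bounds requested" default:

    const int n = 1 << 20;                        // hypothetical problem size
    auto functor = KOKKOS_LAMBDA(const int i) {}; // placeholder body

    // Explicit bounds: selects the specialization that applies
    // __launch_bounds__(256, 4) to the generated kernel.
    using Bounded = Kokkos::RangePolicy<Kokkos::Cuda, Kokkos::LaunchBounds<256, 4>>;
    Kokkos::parallel_for(Bounded(0, n), functor);

    // No bounds given: the LaunchBounds<> specialization is chosen and no
    // __launch_bounds__ attribute is emitted at all.
    Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::Cuda>(0, n), functor);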

View File

@ -713,7 +713,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
// Iterate records to print orphaned memory ...
void
SharedAllocationRecord< Kokkos::CudaSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaSpace & , bool detail )
{
SharedAllocationRecord< void , void > * r = & s_root_record ;
@ -751,7 +751,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -781,7 +781,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
else {
snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != & s_root_record );
}
@ -789,14 +789,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
void
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaUVMSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
}
void
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
}

View File

@ -421,7 +421,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
std::string msg = ss.str();
Kokkos::abort( msg.c_str() );
}
if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) {
if ( Kokkos::show_warnings() && (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor) ) {
std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
<< compiled_major << "." << compiled_minor
<< " on device with compute capability "
@ -467,7 +467,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
if ( ! m_scratchUnifiedSupported ) {
if ( Kokkos::show_warnings() && ! m_scratchUnifiedSupported ) {
std::cout << "Kokkos::Cuda device "
<< cudaProp.name << " capability "
<< cudaProp.major << "." << cudaProp.minor
@ -545,7 +545,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
}
#ifdef KOKKOS_ENABLE_CUDA_UVM
if(!cuda_launch_blocking()) {
if( Kokkos::show_warnings() && !cuda_launch_blocking() ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cout << " The code must call Cuda::fence() after each kernel" << std::endl;
@ -561,7 +561,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
bool visible_devices_one=true;
if (env_visible_devices == 0) visible_devices_one=false;
if(!visible_devices_one && !force_device_alloc) {
if( Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc) ) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl;

View File

@ -381,12 +381,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
using RP = Policy;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -402,7 +402,7 @@ public:
__device__
void operator()(void) const
{
Kokkos::Experimental::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
Kokkos::Impl::Refactor::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag>(m_rp,m_functor).exec_range();
}
@ -858,14 +858,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Cuda
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ;
typedef Kokkos::MDRangePolicy< Traits ... > Policy ;
typedef typename Policy::array_index_type array_index_type;
typedef typename Policy::index_type index_type;
@ -898,7 +898,7 @@ public:
size_type * m_scratch_flags ;
size_type * m_unified_space ;
typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
@ -913,7 +913,7 @@ public:
void
exec_range( reference_type update ) const
{
Kokkos::Experimental::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank,Policy,FunctorType,typename Policy::work_tag, reference_type>(m_policy, m_functor, update).exec_range();
}
inline

View File

@ -127,11 +127,11 @@ struct CudaTextureFetch {
template< class CudaMemorySpace >
inline explicit
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
, Kokkos::Impl::SharedAllocationRecord< CudaMemorySpace , void > * record
)
: m_obj( record.template attach_texture_object< AliasType >() )
: m_obj( record->template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
, m_offset( record->attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
{}
// Texture object spans the entire allocation.
@ -199,7 +199,7 @@ struct CudaLDGFetch {
template< class CudaMemorySpace >
inline explicit
CudaLDGFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
, Kokkos::Impl::SharedAllocationRecord<CudaMemorySpace,void>*
)
: m_ptr( arg_ptr )
{}
@ -285,7 +285,21 @@ public:
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
// if called in a host execution space
return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
typedef typename Traits::memory_space memory_space ;
typedef typename Impl::SharedAllocationRecord<memory_space,void> record ;
record * const r = arg_tracker.template get_record< memory_space >();
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
if ( 0 == r ) {
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
}
#endif
return handle_type( arg_data_ptr , r );
#else
Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
return handle_type();
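
The new null-record abort catches const RandomAccess Views over memory Kokkos did not allocate, since there is no SharedAllocationRecord to attach a texture object to. A hedged sketch of the offending pattern, with a hypothetical size:

    double* raw = nullptr;
    cudaMalloc(&raw, 1024 * sizeof(double));  // memory Kokkos did not allocate

    // Unmanaged view: get_record() yields null, so (without
    // KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) construction now aborts instead of
    // binding a texture object to an allocation it cannot see.
    Kokkos::View<const double*, Kokkos::MemoryTraits<Kokkos::RandomAccess>>
        v(raw, 1024);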

View File

@ -48,48 +48,50 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::Cuda
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Cuda,
Traits ...
class ParallelFor< FunctorType
, Kokkos::WorkGraphPolicy< Traits ... >
, Kokkos::Cuda
>
{
public:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Cuda, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType, Policy, Kokkos::Cuda> Self ;
private:
template< class TagType >
__device__
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
__device__
__device__ inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
__device__ inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
__device__
inline
void operator()() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
__device__ inline
void operator()() const noexcept
{
if ( 0 == ( threadIdx.y % 16 ) ) {
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
@ -108,9 +110,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
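
A sketch of driving the rewritten ParallelFor, assuming a hypothetical Crs graph built elsewhere; each index w reaches the functor only after everything pointing at w has completed:

    using policy_type = Kokkos::WorkGraphPolicy<std::int32_t, Kokkos::Cuda>;
    policy_type::graph_type graph;  // hypothetical: filled elsewhere
    Kokkos::parallel_for("dag", policy_type(graph),
      KOKKOS_LAMBDA(const std::int32_t w) {
        // process work item w; completed_work(w) then releases successors
      });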

View File

@ -55,7 +55,7 @@
#include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
#endif
namespace Kokkos { namespace Experimental {
namespace Kokkos {
// ------------------------------------------------------------------ //
@ -331,11 +331,23 @@ struct MDRangePolicy
}
};
} // namespace Kokkos
// For backward compatibility
namespace Kokkos { namespace Experimental {
using Kokkos::MDRangePolicy;
using Kokkos::Rank;
using Kokkos::Iterate;
} } // end Kokkos::Experimental
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_for - deprecated use parallel_for
// ------------------------------------------------------------------ //
namespace Kokkos { namespace Experimental {
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f
@ -347,7 +359,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -365,7 +377,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
@ -385,7 +397,7 @@ void md_parallel_for( const std::string& str
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
@ -400,7 +412,7 @@ void md_parallel_for( MDRange const& range
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
#endif
@ -421,7 +433,7 @@ void md_parallel_reduce( MDRange const& range
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
@ -439,7 +451,7 @@ void md_parallel_reduce( const std::string& str
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f);
Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
using range_policy = typename MDRange::impl_range_policy;

View File

@ -81,10 +81,10 @@ struct IndexType
/**\brief Specify Launch Bounds for CUDA execution.
*
* The "best" defaults may be architecture specific.
* If no launch bounds specified then do not set launch bounds.
*/
template< unsigned int maxT = 1024 /* Max threads per block */
, unsigned int minB = 1 /* Min blocks per SM */
template< unsigned int maxT = 0 /* Max threads per block */
, unsigned int minB = 0 /* Min blocks per SM */
>
struct LaunchBounds
{
@ -280,6 +280,9 @@ struct MemorySpaceAccess {
enum { deepcopy = assignable };
};
}} // namespace Kokkos::Impl
namespace Kokkos {
/**\brief Can AccessSpace access MemorySpace ?
*
@ -358,6 +361,13 @@ public:
>::type space ;
};
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
using Kokkos::SpaceAccessibility ; // For backward compatibility
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
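
A short sketch of the promoted trait; the Impl alias above keeps old call sites compiling:

    // Compile-time query: can code running in the first space reach memory
    // in the second? Now spelled Kokkos::SpaceAccessibility.
    static_assert(Kokkos::SpaceAccessibility<
                      Kokkos::HostSpace, Kokkos::HostSpace>::accessible,
                  "host code can access host memory");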

View File

@ -99,13 +99,17 @@ struct InitArguments {
int num_threads;
int num_numa;
int device_id;
bool disable_warnings;
InitArguments( int nt = -1
, int nn = -1
, int dv = -1)
: num_threads( nt )
, num_numa( nn )
, device_id( dv )
, int dv = -1
, bool dw = false
)
: num_threads{ nt }
, num_numa{ nn }
, device_id{ dv }
, disable_warnings{ dw }
{}
};
@ -113,6 +117,10 @@ void initialize(int& narg, char* arg[]);
void initialize(const InitArguments& args = InitArguments());
bool is_initialized() noexcept;
bool show_warnings() noexcept;
/** \brief Finalize the spaces that were initialized via Kokkos::initialize */
void finalize();
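
A sketch of the new knob, using only the field added above:

    Kokkos::InitArguments args;    // num_threads / num_numa / device_id stay -1
    args.disable_warnings = true;  // field added in this update
    Kokkos::initialize(args);
    // Kokkos::show_warnings() now returns false, silencing the
    // initialization warnings gated on it elsewhere in this commit.
    Kokkos::finalize();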

View File

@ -45,7 +45,6 @@
#define KOKKOS_CRS_HPP
namespace Kokkos {
namespace Experimental {
/// \class Crs
/// \brief Compressed row storage array.
@ -164,7 +163,7 @@ void transpose_crs(
Crs<DataType, Arg1Type, Arg2Type, SizeType>& out,
Crs<DataType, Arg1Type, Arg2Type, SizeType> const& in);
}} // namespace Kokkos::Experimental
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -172,7 +171,6 @@ void transpose_crs(
namespace Kokkos {
namespace Impl {
namespace Experimental {
template <class InCrs, class OutCounts>
class GetCrsTransposeCounts {
@ -277,14 +275,13 @@ class FillCrsTransposeEntries {
}
};
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Experimental {
template< class OutCounts,
class DataType,
@ -297,8 +294,7 @@ void get_crs_transpose_counts(
std::string const& name) {
using InCrs = Crs<DataType, Arg1Type, Arg2Type, SizeType>;
out = OutCounts(name, in.numRows());
Kokkos::Impl::Experimental::
GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
Kokkos::Impl::GetCrsTransposeCounts<InCrs, OutCounts> functor(in, out);
}
template< class OutRowMap,
@ -308,8 +304,7 @@ typename OutRowMap::value_type get_crs_row_map_from_counts(
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
Kokkos::Impl::CrsRowMapFromCounts<InCounts, OutRowMap> functor(in, out);
return functor.execute();
}
@ -326,32 +321,37 @@ void transpose_crs(
typedef View<SizeType*, memory_space> counts_type ;
{
counts_type counts;
Kokkos::Experimental::get_crs_transpose_counts(counts, in);
Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts,
Kokkos::get_crs_transpose_counts(counts, in);
Kokkos::get_crs_row_map_from_counts(out.row_map, counts,
"tranpose_row_map");
}
out.entries = decltype(out.entries)("transpose_entries", in.entries.size());
Kokkos::Impl::Experimental::
Kokkos::Impl::
FillCrsTransposeEntries<crs_type, crs_type> entries_functor(in, out);
}
template< class CrsType,
class Functor>
struct CountAndFill {
class Functor,
class ExecutionSpace = typename CrsType::execution_space>
struct CountAndFillBase;
template< class CrsType,
class Functor,
class ExecutionSpace>
struct CountAndFillBase {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using entries_type = typename CrsType::entries_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
@ -363,13 +363,63 @@ struct CountAndFill {
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#if defined( KOKKOS_ENABLE_CUDA )
template< class CrsType,
class Functor>
struct CountAndFillBase<CrsType, Functor, Kokkos::Cuda> {
using data_type = typename CrsType::size_type;
using size_type = typename CrsType::size_type;
using row_map_type = typename CrsType::row_map_type;
using counts_type = row_map_type;
CrsType m_crs;
Functor m_functor;
counts_type m_counts;
struct Count {};
__device__ inline void operator()(Count, size_type i) const {
m_counts(i) = m_functor(i, nullptr);
}
struct Fill {};
__device__ inline void operator()(Fill, size_type i) const {
auto j = m_crs.row_map(i);
/* we don't want to access entries(entries.size()), even if its just to get its
address and never use it.
this can happen when row (i) is empty and all rows after it are also empty.
we could compare to row_map(i + 1), but that is a read from global memory,
whereas dimension_0() should be part of the View in registers (or constant memory) */
data_type* fill =
(j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
nullptr : (&(m_crs.entries(j)));
m_functor(i, fill);
}
CountAndFillBase(CrsType& crs, Functor const& f):
m_crs(crs),
m_functor(f)
{}
};
#endif
template< class CrsType,
class Functor>
struct CountAndFill : public CountAndFillBase<CrsType, Functor> {
using base_type = CountAndFillBase<CrsType, Functor>;
using typename base_type::data_type;
using typename base_type::size_type;
using typename base_type::counts_type;
using typename base_type::Count;
using typename base_type::Fill;
using entries_type = typename CrsType::entries_type;
using self_type = CountAndFill<CrsType, Functor>;
CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
base_type(crs, f)
{
using execution_space = typename CrsType::execution_space;
m_counts = counts_type("counts", nrows);
this->m_counts = counts_type("counts", nrows);
{
using count_policy_type = RangePolicy<size_type, execution_space, Count>;
using count_closure_type =
@ -377,10 +427,10 @@ struct CountAndFill {
const count_closure_type closure(*this, count_policy_type(0, nrows));
closure.execute();
}
auto nentries = Kokkos::Experimental::
get_crs_row_map_from_counts(m_crs.row_map, m_counts);
m_counts = counts_type();
m_crs.entries = entries_type("entries", nentries);
auto nentries = Kokkos::
get_crs_row_map_from_counts(this->m_crs.row_map, this->m_counts);
this->m_counts = counts_type();
this->m_crs.entries = entries_type("entries", nentries);
{
using fill_policy_type = RangePolicy<size_type, execution_space, Fill>;
using fill_closure_type =
@ -388,7 +438,7 @@ struct CountAndFill {
const fill_closure_type closure(*this, fill_policy_type(0, nrows));
closure.execute();
}
crs = m_crs;
crs = this->m_crs;
}
};
@ -398,9 +448,9 @@ void count_and_fill_crs(
CrsType& crs,
typename CrsType::size_type nrows,
Functor const& f) {
Kokkos::Experimental::CountAndFill<CrsType, Functor>(crs, nrows, f);
Kokkos::CountAndFill<CrsType, Functor>(crs, nrows, f);
}
}} // namespace Kokkos::Experimental
} // namespace Kokkos
#endif /* #define KOKKOS_CRS_HPP */
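
A sketch of the two-pass protocol CountAndFill implements; the functor is invoked once per row with fill == nullptr to count, then again with a pointer into the freshly sized entries array (hypothetical lambda and row count):

    using crs_type = Kokkos::Crs<int, Kokkos::DefaultExecutionSpace, void, int>;
    crs_type crs;
    const int nrows = 4;  // hypothetical
    Kokkos::count_and_fill_crs(crs, nrows,
      KOKKOS_LAMBDA(int row, int* fill) -> int {
        if (fill) {    // second pass: write this row's entries
          fill[0] = row;
          fill[1] = row + 1;
        }
        return 2;      // first pass: entry count for this row
      });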

View File

@ -379,12 +379,13 @@ Impl::PerThreadValue PerThread(const int& arg);
* uses variadic templates. Each and any of the template arguments can
* be omitted.
*
* Possible Template arguments and there default values:
* Possible Template arguments and their default values:
* ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled.
* WorkTag (none): Tag which is used as the first argument for the functor operator.
* Schedule<Type> (Schedule<Static>): Scheduling Policy (Dynamic, or Static).
* IndexType<Type> (IndexType<ExecutionSpace::size_type>: Integer Index type used to iterate over the Index space.
* LaunchBounds<int,int> (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation.
* LaunchBounds<unsigned,unsigned> Launch Bounds for CUDA compilation,
* default of LaunchBounds<0,0> indicates no launch bounds specified.
*/
template< class ... Properties>
class TeamPolicy: public
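
Read together with the LaunchBounds change earlier in this commit, omitting the property now means "no launch bounds" rather than the old <1024,1>. A hedged sketch with a hypothetical league size:

    // Explicit bounds for a team kernel; the unspecified-bounds default is
    // now LaunchBounds<0,0> instead of LaunchBounds<1024,1>.
    Kokkos::TeamPolicy<Kokkos::LaunchBounds<128, 2>> policy(64, Kokkos::AUTO);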

View File

@ -274,18 +274,14 @@
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#endif
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 64
#endif
#define KOKKOS_RESTRICT __restrict__
#ifndef KOKKOS_ALIGN
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#ifndef KOKKOS_ALIGN_PTR
#define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#ifndef KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 64
#ifndef KOKKOS_IMPL_ALIGN_PTR
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size)))
#endif
#if ( 1400 > KOKKOS_COMPILER_INTEL )
@ -351,6 +347,11 @@
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size)))
#endif
#endif
//----------------------------------------------------------------------------
@ -426,16 +427,16 @@
//----------------------------------------------------------------------------
// Define Macro for alignment:
#if !defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#if ! defined( KOKKOS_MEMORY_ALIGNMENT )
#define KOKKOS_MEMORY_ALIGNMENT 16
#endif
#if !defined( KOKKOS_ALIGN )
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#if ! defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
#define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 4
#endif
#if !defined( KOKKOS_ALIGN_PTR )
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#if !defined( KOKKOS_IMPL_ALIGN_PTR )
#define KOKKOS_IMPL_ALIGN_PTR(size) /* */
#endif
//----------------------------------------------------------------------------

View File

@ -111,6 +111,10 @@ private:
public:
/**\brief The maximum size of a superblock and block */
enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ };
enum : uint32_t { max_block_per_superblock = max_bit_count };
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
@ -206,7 +210,7 @@ public:
const uint32_t * sb_state_ptr = sb_state_array ;
s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")"
<< " superblock_size(" << ( 1 << m_sb_size_lg2 ) << ")" << std::endl ;
<< " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ;
for ( int32_t i = 0 ; i < m_sb_count
; ++i , sb_state_ptr += m_sb_state_size ) {
@ -215,7 +219,7 @@ public:
const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ;
const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ;
const uint32_t block_count = 1 << block_count_lg2 ;
const uint32_t block_count = 1u << block_count_lg2 ;
const uint32_t block_used = (*sb_state_ptr) & state_used_mask ;
s << "Superblock[ " << i << " / " << m_sb_count << " ] {"
@ -284,43 +288,87 @@ public:
{
const uint32_t int_align_lg2 = 3 ; /* align as int[8] */
const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ;
const uint32_t default_min_block_size = 1u << 6 ; /* 64 bytes */
const uint32_t default_max_block_size = 1u << 12 ;/* 4k bytes */
const uint32_t default_min_superblock_size = 1u << 20 ;/* 1M bytes */
// Constraints and defaults:
// min_block_alloc_size <= max_block_alloc_size
// max_block_alloc_size <= min_superblock_size
// min_superblock_size <= min_total_alloc_size
//--------------------------------------------------
// Default block and superblock sizes:
const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ;
const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ;
if ( 0 == min_block_alloc_size ) {
// Default all sizes:
if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ;
min_superblock_size =
std::min( size_t(default_min_superblock_size)
, min_total_alloc_size );
min_block_alloc_size =
std::min( size_t(default_min_block_size)
, min_superblock_size );
max_block_alloc_size =
std::min( size_t(default_max_block_size)
, min_superblock_size );
}
else if ( 0 == min_superblock_size ) {
// Choose superblock size as minimum of:
// max_block_per_superblock * min_block_size
// max_superblock_size
// min_total_alloc_size
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
min_superblock_size =
std::min( max_superblock ,
std::min( size_t(max_superblock_size)
, min_total_alloc_size ) );
}
if ( 0 == max_block_alloc_size ) {
max_block_alloc_size = MAX_BLOCK_SIZE ;
// Upper bound of total allocation size
max_block_alloc_size = std::min( size_t(max_block_alloc_size)
, min_total_alloc_size );
// Lower bound of minimum block size
max_block_alloc_size = std::max( max_block_alloc_size
, min_block_alloc_size );
max_block_alloc_size = min_superblock_size ;
}
if ( 0 == min_superblock_size ) {
min_superblock_size = max_block_alloc_size ;
//--------------------------------------------------
// Upper bound of total allocation size
min_superblock_size = std::min( size_t(min_superblock_size)
, min_total_alloc_size );
{
/* Enforce size constraints:
* min_block_alloc_size <= max_block_alloc_size
* max_block_alloc_size <= min_superblock_size
* min_superblock_size <= max_superblock_size
* min_superblock_size <= min_total_alloc_size
* min_superblock_size <= min_block_alloc_size *
* max_block_per_superblock
*/
// Lower bound of maximum block size
min_superblock_size = std::max( min_superblock_size
, max_block_alloc_size );
const size_t max_superblock =
min_block_alloc_size * max_block_per_superblock ;
if ( ( size_t(max_superblock_size) < min_superblock_size ) ||
( min_total_alloc_size < min_superblock_size ) ||
( max_superblock < min_superblock_size ) ||
( min_superblock_size < max_block_alloc_size ) ||
( max_block_alloc_size < min_block_alloc_size ) ) {
#if 1
printf( " MemoryPool min_block_alloc_size(%ld) max_block_alloc_size(%ld) min_superblock_size(%ld) min_total_alloc_size(%ld) ; max_superblock_size(%ld) max_block_per_superblock(%ld)\n"
, min_block_alloc_size
, max_block_alloc_size
, min_superblock_size
, min_total_alloc_size
, size_t(max_superblock_size)
, size_t(max_block_per_superblock)
);
#endif
Kokkos::abort("Kokkos MemoryPool size constraint violation");
}
}
//--------------------------------------------------
// Block and superblock size is power of two:
// Maximum value is 'max_superblock_size'
m_min_block_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size);
@ -331,36 +379,16 @@ public:
m_sb_size_lg2 =
Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size);
// Constraints:
// m_min_block_size_lg2 <= m_max_block_size_lg2 <= m_sb_size_lg2
// m_sb_size_lg2 <= m_min_block_size + max_bit_count_lg2
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_sb_size_lg2 ) {
m_min_block_size_lg2 = m_sb_size_lg2 - max_bit_count_lg2 ;
}
if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_max_block_size_lg2 ) {
m_min_block_size_lg2 = m_max_block_size_lg2 - max_bit_count_lg2 ;
}
if ( m_max_block_size_lg2 < m_min_block_size_lg2 ) {
m_max_block_size_lg2 = m_min_block_size_lg2 ;
}
if ( m_sb_size_lg2 < m_max_block_size_lg2 ) {
m_sb_size_lg2 = m_max_block_size_lg2 ;
}
// At least 32 minimum size blocks in a superblock
if ( m_sb_size_lg2 < m_min_block_size_lg2 + 5 ) {
m_sb_size_lg2 = m_min_block_size_lg2 + 5 ;
}
{
// number of superblocks is multiple of superblock size that
// can hold min_total_alloc_size.
const uint32_t sb_size_mask = ( 1u << m_sb_size_lg2 ) - 1 ;
const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ;
m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ;
}
{
// Any superblock can be assigned to the smallest size block
// Size the block bitset to maximum number of blocks
@ -369,6 +397,7 @@ public:
m_sb_state_size =
( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ;
}
// Array of all superblock states
@ -454,7 +483,7 @@ private:
* Restrict lower bound to minimum block size.
*/
KOKKOS_FORCEINLINE_FUNCTION
unsigned get_block_size_lg2( unsigned n ) const noexcept
uint32_t get_block_size_lg2( uint32_t n ) const noexcept
{
const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n );
@ -463,11 +492,12 @@ private:
public:
/* Return 0 for invalid block size */
KOKKOS_INLINE_FUNCTION
uint32_t allocate_block_size( uint32_t alloc_size ) const noexcept
uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept
{
return alloc_size <= (1UL << m_max_block_size_lg2)
? ( 1u << get_block_size_lg2( alloc_size ) )
? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) )
: 0 ;
}
@ -485,14 +515,16 @@ public:
void * allocate( size_t alloc_size
, int32_t attempt_limit = 1 ) const noexcept
{
if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
if ( 0 == alloc_size ) return (void*) 0 ;
void * p = 0 ;
const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size );
if ( block_size_lg2 <= m_max_block_size_lg2 ) {
// Allocation will fit within a superblock
// that has block sizes ( 1 << block_size_lg2 )
@ -577,14 +609,24 @@ public:
// Set the allocated block pointer
p = ((char*)( m_sb_state_array + m_data_offset ))
+ ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( result.first << size_lg2 ); // block memory
+ ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory
+ ( uint64_t(result.first) << size_lg2 ); // block memory
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, alloc_size
, sb_id
, sb_state
, (1u << size_lg2)
, (1u << count_lg2)
, result.first
, result.second );
#endif
break ; // Success
}
// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second );
}
//------------------------------------------------------------------
// Arrive here if failed to acquire a block.
@ -719,12 +761,7 @@ public:
( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) );
}
} // end allocation attempt loop
//--------------------------------------------------------------------
}
else {
Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size");
}
return p ;
}
@ -765,7 +802,7 @@ public:
const uint32_t block_size_lg2 =
m_sb_size_lg2 - ( block_state >> state_shift );
ok_block_aligned = 0 == ( d & ( ( 1 << block_size_lg2 ) - 1 ) );
ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) );
if ( ok_block_aligned ) {
@ -773,31 +810,70 @@ public:
// mask into superblock and then shift down for block index
const uint32_t bit =
( d & ( ptrdiff_t( 1 << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ;
const int result =
CB::release( sb_state_array , bit , block_state );
ok_dealloc_once = 0 <= result ;
// printf(" deallocate from sb_id(%d) result(%d) bit(%d) state(0x%x)\n"
// , sb_id
// , result
// , uint32_t(d >> block_size_lg2)
// , *sb_state_array );
#if 0
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, sb_id
, (1u << block_size_lg2)
, (1u << (m_sb_size_lg2 - block_size_lg2))
, bit
, result );
#endif
}
}
if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) {
#if 0
printf("Kokkos MemoryPool deallocate(0x%lx) contains(%d) block_aligned(%d) dealloc_once(%d)\n",(uintptr_t)p,ok_contains,ok_block_aligned,ok_dealloc_once);
printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n"
, (uintptr_t)m_sb_state_array
, (uintptr_t)p
, int(ok_contains)
, int(ok_block_aligned)
, int(ok_dealloc_once) );
#endif
Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer");
}
}
// end deallocate
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
int number_of_superblocks() const noexcept { return m_sb_count ; }
KOKKOS_INLINE_FUNCTION
void superblock_state( int sb_id
, int & block_size
, int & block_count_capacity
, int & block_count_used ) const noexcept
{
block_size = 0 ;
block_count_capacity = 0 ;
block_count_used = 0 ;
if ( Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, base_memory_space >::accessible ) {
// Can access the state array
const uint32_t state =
((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size];
const uint32_t block_count_lg2 = state >> state_shift ;
const uint32_t block_used = state & state_used_mask ;
block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 );
block_count_capacity = 1LU << block_count_lg2 ;
block_count_used = block_used ;
}
}
};
} // namespace Kokkos
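
A sketch that exercises the constraints and the new introspection hook, assuming hypothetical sizes; zero-valued size arguments pick the defaults described above:

    using memory_space = Kokkos::DefaultExecutionSpace::memory_space;

    // 64 MB total; block and superblock sizes fall back to the documented
    // defaults (64 B min block, 4 KB max block, capped superblock).
    Kokkos::MemoryPool<Kokkos::DefaultExecutionSpace>
        pool(memory_space(), 64u << 20);

    void* p = pool.allocate(512);  // served from a power-of-two sized block
    pool.deallocate(p, 512);

    int block_size, capacity, used;
    pool.superblock_state(0, block_size, capacity, used);  // new in this commit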

View File

@ -97,26 +97,22 @@ typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryR
namespace Kokkos {
namespace Impl {
static_assert(
( 0 < int(KOKKOS_MEMORY_ALIGNMENT) ) &&
( 0 == ( int(KOKKOS_MEMORY_ALIGNMENT) & (int(KOKKOS_MEMORY_ALIGNMENT)-1))) ,
"KOKKOS_MEMORY_ALIGNMENT must be a power of two" );
/** \brief Memory alignment settings
*
* Sets global value for memory alignment. Must be a power of two!
* Enable compatibility of views from different devices with static stride.
* Use compiler flag to enable overwrites.
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
#else
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD )
enum : unsigned
{ MEMORY_ALIGNMENT = KOKKOS_MEMORY_ALIGNMENT
, MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD
#else
, MEMORY_ALIGNMENT_THRESHOLD = 4
#endif
};
} //namespace Impl
} // namespace Kokkos
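
The static_assert leans on the classic bit trick: x & (x - 1) clears the lowest set bit, so the result is zero exactly when x is a power of two. A standalone sketch:

    #include <cstdio>

    constexpr bool is_pow2(unsigned x) { return x != 0 && (x & (x - 1)) == 0; }

    static_assert(is_pow2(64), "64 = 2^6 would pass the alignment check");
    static_assert(!is_pow2(48), "48 has two set bits and would be rejected");

    int main() {
      std::printf("64 -> %d, 48 -> %d\n", is_pow2(64), is_pow2(48));
      return 0;
    }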

View File

@ -204,8 +204,8 @@ struct VerifyExecutionCanAccessMemorySpace
>
{
enum { value = false };
inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
};
} // namespace Experimental
} // namespace Kokkos

View File

@ -619,16 +619,16 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::MDRangePolicy< Traits ... > ,
Kokkos::MDRangePolicy< Traits ... > ,
Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -661,14 +661,14 @@ public:
template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -686,7 +686,7 @@ private:
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -408,7 +408,7 @@ view_alloc( Args const & ... args )
}
template< class ... Args >
inline
KOKKOS_INLINE_FUNCTION
Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... >
view_wrap( Args const & ... args )
{
@ -1216,6 +1216,13 @@ public:
m_track.assign_allocated_record_to_uninitialized( record );
}
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_data )
{
m_track.clear();
m_map.assign_data( arg_data );
}
// Wrap memory according to properties and array layout
template< class ... P >
explicit KOKKOS_INLINE_FUNCTION
@ -2235,6 +2242,29 @@ create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
deep_copy(mirror, src);
return mirror;
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
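
A sketch of the convenience function just added, with a hypothetical view:

    Kokkos::View<double*> a("a", 1000);  // lives in the default memory space

    // Allocates an uninitialized HostSpace mirror and deep_copies 'a' into
    // it; had 'a' already been host-resident, the same-space overload would
    // return 'a' itself with no allocation or copy.
    auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a);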
@ -2432,6 +2462,7 @@ struct CommonViewAllocProp< void, ValueType >
using scalar_array_type = ValueType;
template < class ... Views >
KOKKOS_INLINE_FUNCTION
CommonViewAllocProp( const Views & ... ) {}
};
@ -2499,6 +2530,7 @@ using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp<Views...
// User function
template < class ... Views >
KOKKOS_INLINE_FUNCTION
DeducedCommonPropsType<Views...>
common_view_alloc_prop( Views const & ... views )
{

View File

@ -46,15 +46,13 @@
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec;
}}} // namespace Kokkos::Impl::Experimental
}} // namespace Kokkos::Impl
namespace Kokkos {
namespace Experimental {
template< class ... Properties >
class WorkGraphPolicy
@ -64,187 +62,182 @@ public:
using self_type = WorkGraphPolicy<Properties ... >;
using traits = Kokkos::Impl::PolicyTraits<Properties ... >;
using index_type = typename traits::index_type;
using execution_space = typename traits::execution_space;
using work_tag = typename traits::work_tag;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Experimental::Crs<index_type, execution_space, void, index_type>;
using member_type = index_type;
using work_tag = typename traits::work_tag;
using execution_space = typename traits::execution_space;
using memory_space = typename execution_space::memory_space;
using graph_type = Kokkos::Crs<index_type,execution_space,void,index_type>;
enum : std::int32_t {
END_TOKEN = -1 ,
BEGIN_TOKEN = -2 ,
COMPLETED_TOKEN = -3 };
private:
graph_type m_graph;
using ints_type = Kokkos::View<std::int32_t*, memory_space>;
using range_type = Kokkos::pair<std::int32_t, std::int32_t>;
using ranges_type = Kokkos::View<range_type*, memory_space>;
const std::int32_t m_total_work;
ints_type m_counts;
// Let N = m_graph.numRows(), the total work
// m_queue[ 0 .. N-1] = the ready queue
// m_queue[ N .. 2*N-1] = the waiting queue counts
// m_queue[2*N .. 2*N+1] = the ready queue hints
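// For a hypothetical N = 3 graph the layout is
//   [ r0 r1 r2 | c0 c1 c2 | begin_hint end_hint ]
// with ready slots initialized to END_TOKEN and counts to zero.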
graph_type const m_graph;
ints_type m_queue ;
ranges_type m_ranges;
public:
struct TagZeroRanges {};
KOKKOS_INLINE_FUNCTION
void operator()(TagZeroRanges, std::int32_t i) const {
m_ranges[i] = range_type(0, 0);
}
void zero_ranges() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagZeroRanges>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, 1));
closure.execute();
execution_space::fence();
}
struct TagFillQueue {};
KOKKOS_INLINE_FUNCTION
void operator()(TagFillQueue, std::int32_t i) const {
if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i);
}
void fill_queue() {
using policy_type = RangePolicy<std::int32_t, execution_space, TagFillQueue>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_total_work));
closure.execute();
execution_space::fence();
}
private:
inline
void setup() {
if (m_graph.numRows() > std::numeric_limits<std::int32_t>::max()) {
Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t");
}
get_crs_transpose_counts(m_counts, m_graph);
m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work);
deep_copy(m_queue, std::int32_t(-1));
m_ranges = ranges_type("ranges", 1);
fill_queue();
}
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first + 1 , w.second );
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
if ( w.first < w.second ) { // there was work in the queue
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
// we got a work item
std::int32_t i;
// the push_work function may have incremented the end counter
// but not yet written the work index into the queue.
// wait until the entry is valid.
while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) );
return i;
} // we got a work item
} else { // there was no work in the queue
#ifdef KOKKOS_DEBUG
if ( w_new.first == w.first + 1 && w_new.second == w.second ) {
Kokkos::abort("bug in pop_work");
}
#endif
if (w.first == m_total_work) { // all work is done
return -1;
} else { // need to wait for more work to be pushed
// take a guess that one work item will be pushed
// the key thing is we can't leave (w) alone, because
// otherwise the next compare_exchange may succeed in
// popping work from an empty queue
w.second++;
}
} // there was no work in the queue
} // while (true)
void push_work( const std::int32_t w ) const noexcept
{
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const end_hint = & m_queue[2*N+1] ;
// Push work to end of queue
const std::int32_t j = atomic_fetch_add( end_hint , 1 );
if ( ( N <= j ) ||
( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) {
// ERROR: past the end of queue or did not replace END_TOKEN
Kokkos::abort("WorkGraphPolicy push_work error");
}
KOKKOS_INLINE_FUNCTION
void push_work(std::int32_t i) const {
range_type w(-1,-1);
while (true) {
const range_type w_new( w.first , w.second + 1 );
// try to increment the end counter
w = atomic_compare_exchange( &m_ranges(0) , w , w_new );
// stop trying if the increment was successful
if ( w.first == w_new.first && w.second + 1 == w_new.second ) break;
}
// write the work index into the claimed spot in the queue
*((volatile std::int32_t*)(&m_queue( w.second ))) = i;
// push this write out into the memory system
memory_fence();
}
template< class functor_type , class execution_space, class ... policy_args >
friend class Kokkos::Impl::Experimental::WorkGraphExec;
public:
WorkGraphPolicy(graph_type arg_graph)
/**\brief Attempt to pop the work item at the head of the queue.
*
* Find entry 'i' such that
* ( m_queue[i] != BEGIN_TOKEN ) AND
* ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN )
* if found then
* increment begin hint
* return atomic_exchange( m_queue[i] , BEGIN_TOKEN )
* else if i < total work
* return END_TOKEN
* else
* return COMPLETED_TOKEN
*
*/
KOKKOS_INLINE_FUNCTION
std::int32_t pop_work() const noexcept
{
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const ready_queue = & m_queue[0] ;
std::int32_t volatile * const begin_hint = & m_queue[2*N] ;
// begin hint is guaranteed to be less than or equal to
// actual begin location in the queue.
for ( std::int32_t i = *begin_hint ; i < N ; ++i ) {
const std::int32_t w = ready_queue[i] ;
if ( w == END_TOKEN ) { return END_TOKEN ; }
if ( ( w != BEGIN_TOKEN ) &&
( w == atomic_compare_exchange(ready_queue+i,w,BEGIN_TOKEN) ) ) {
// Attempt to claim ready work index succeeded,
// update the hint and return work index
atomic_increment( begin_hint );
return w ;
}
// arrive here when ready_queue[i] == BEGIN_TOKEN
}
return COMPLETED_TOKEN ;
}
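// Editor's walk-through (illustration, not part of the source): for a 3-row
// chain graph 0 -> 1 -> 2, the TagInit/TagCount/TagReady passes below leave
//   ready_queue = { 0, END_TOKEN, END_TOKEN }   count_queue = { 0, 1, 1 }
// (row 0 has no predecessors, so TagReady pushed it). A single worker then
// drives the token protocol documented above as:
//   pop_work()        -> 0                claims entry 0, bumps begin hint
//   completed_work(0) -> count of row 1 hits 0, push_work(1)
//   pop_work()        -> 1, completed_work(1) releases row 2
//   pop_work()        -> 2, completed_work(2) has no successors
//   pop_work()        -> COMPLETED_TOKEN  (begin hint reached m_total_work)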
KOKKOS_INLINE_FUNCTION
void completed_work( std::int32_t w ) const noexcept
{
Kokkos::memory_fence();
// Make sure the completed work function's memory accesses are flushed.
const std::int32_t N = m_graph.numRows();
std::int32_t volatile * const count_queue = & m_queue[N] ;
const std::int32_t B = m_graph.row_map(w);
const std::int32_t E = m_graph.row_map(w+1);
for ( std::int32_t i = B ; i < E ; ++i ) {
const std::int32_t j = m_graph.entries(i);
if ( 1 == atomic_fetch_add(count_queue+j,-1) ) {
push_work(j);
}
}
}
struct TagInit {};
struct TagCount {};
struct TagReady {};
/**\brief Initialize queue
*
* m_queue[0..N-1] = END_TOKEN, the ready queue
* m_queue[N..2*N-1] = 0, the waiting count queue
* m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue
*/
KOKKOS_INLINE_FUNCTION
void operator()( const TagInit , int i ) const noexcept
{ m_queue[i] = i < m_graph.numRows() ? END_TOKEN : 0 ; }
KOKKOS_INLINE_FUNCTION
void operator()( const TagCount , int i ) const noexcept
{
std::int32_t volatile * const count_queue =
& m_queue[ m_graph.numRows() ] ;
atomic_increment( count_queue + m_graph.entries[i] );
}
KOKKOS_INLINE_FUNCTION
void operator()( const TagReady , int w ) const noexcept
{
std::int32_t const * const count_queue =
& m_queue[ m_graph.numRows() ] ;
if ( 0 == count_queue[w] ) push_work(w);
}
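// Editor's illustration of the layout documented above, for N = 3 rows
// immediately after TagInit (one allocation of length 2*N+2 = 8):
//
//   index   :  0    1    2    3  4  5    6      7
//   content :  END  END  END  0  0  0    0      0
//   role    :  [ ready queue ][ counts ][begin][end] hints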
WorkGraphPolicy( const graph_type & arg_graph )
: m_graph(arg_graph)
, m_total_work( arg_graph.numRows() )
, m_queue( view_alloc( "queue" , WithoutInitializing )
, arg_graph.numRows() * 2 + 2 )
{
setup();
{ // Initialize
using policy_type = RangePolicy<std::int32_t, execution_space, TagInit>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this, policy_type(0, m_queue.size()));
closure.execute();
execution_space::fence();
}
};
}} // namespace Kokkos::Experimental
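// Editor's usage sketch (not part of this commit): launching a dependence
// graph through the promoted policy. The Crs type matches the unit test later
// in this diff; the template-argument order of the policy is my assumption.
void example_work_graph_launch()
{
  using Space   = Kokkos::DefaultExecutionSpace;
  using graph_t = Kokkos::Crs< std::int32_t, Space, void, std::int32_t >;
  graph_t graph; // assume filled: graph.entries lists each row's successors
  Kokkos::parallel_for( Kokkos::WorkGraphPolicy< std::int32_t, Space >( graph ),
    KOKKOS_LAMBDA( const std::int32_t w ) {
      // process item w; completed_work() then releases its successors
    } );
}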
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
namespace Experimental {
template< class functor_type , class execution_space, class ... policy_args >
class WorkGraphExec
{
public:
using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >;
using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... >;
using member_type = typename policy_type::member_type;
using memory_space = typename execution_space::memory_space;
protected:
const functor_type m_functor;
const policy_type m_policy;
protected:
KOKKOS_INLINE_FUNCTION
std::int32_t before_work() const {
return m_policy.pop_work();
{ // execute-after counts
using policy_type = RangePolicy<std::int32_t, execution_space, TagCount>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.entries.size()));
closure.execute();
execution_space::fence();
}
KOKKOS_INLINE_FUNCTION
void after_work(std::int32_t i) const {
/* fence any writes that were done by the work item itself
(usually writing its result to global memory) */
memory_fence();
const std::int32_t begin = m_policy.m_graph.row_map( i );
const std::int32_t end = m_policy.m_graph.row_map( i + 1 );
for (std::int32_t j = begin; j < end; ++j) {
const std::int32_t next = m_policy.m_graph.entries( j );
const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 );
if ( old_count == 1 ) m_policy.push_work( next );
{ // Scheduling ready tasks
using policy_type = RangePolicy<std::int32_t, execution_space, TagReady>;
using closure_type = Kokkos::Impl::ParallelFor<self_type, policy_type>;
const closure_type closure(*this,policy_type(0,m_graph.numRows()));
closure.execute();
execution_space::fence();
}
}
inline
WorkGraphExec( const functor_type & arg_functor
, const policy_type & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{
}
};
}}} // namespace Kokkos::Impl::Experimental
} // namespace Kokkos
#ifdef KOKKOS_ENABLE_SERIAL
#include "impl/Kokkos_Serial_WorkGraphPolicy.hpp"

View File

@ -294,7 +294,7 @@ void OpenMP::initialize( int thread_count )
}
{
if (nullptr == std::getenv("OMP_PROC_BIND") ) {
if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) {
printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n");
printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n");
printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n");
@ -327,7 +327,7 @@ void OpenMP::initialize( int thread_count )
omp_set_num_threads(Impl::g_openmp_hardware_max_threads);
}
else {
if( thread_count > process_num_threads ) {
if( Kokkos::show_warnings() && thread_count > process_num_threads ) {
printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n");
printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count );
}
@ -364,12 +364,12 @@ void OpenMP::initialize( int thread_count )
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();

View File

@ -170,20 +170,20 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
OpenMPExec * m_instance ;
const FunctorType m_functor ;
@ -445,14 +445,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ...>
, Kokkos::MDRangePolicy< Traits ...>
, ReducerType
, Kokkos::OpenMP
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -472,7 +472,7 @@ private:
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -49,33 +49,26 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::OpenMP
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::OpenMP,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::OpenMP, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{} ; m_functor( t , w ); }
public:
@ -86,9 +79,15 @@ public:
#pragma omp parallel num_threads(pool_size)
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
}
@ -96,12 +95,13 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */

View File

@ -125,7 +125,7 @@ namespace Kokkos {
oldval.t = *dest ;
assume.i = oldval.i ;
newval.t = val ;
atomic_compare_exchange( reinterpret_cast<int*>(dest) , assume.i, newval.i );
atomic_compare_exchange( (int*)(dest) , assume.i, newval.i );
return oldval.t ;
}
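// Editor's sketch of the same bit-punning idea in portable C++11: the routine
// above performs a compare-exchange on the int image of T, which a plain
// std::atomic exchange expresses when T is 4 bytes and trivially copyable.
// The T* -> atomic<int32_t>* cast mirrors the cast in the hunk and carries
// the same strict-aliasing caveat; this is an illustration, not Kokkos API.
#include <atomic>
#include <cstdint>
#include <cstring>

template< class T >
T exchange_via_int_image( T * const dest, const T val )
{
  static_assert( sizeof(T) == sizeof(std::int32_t), "4-byte punning only" );
  std::int32_t bits;
  std::memcpy( &bits, &val, sizeof bits );
  const std::int32_t old_bits =
    reinterpret_cast< std::atomic<std::int32_t>* >( dest )->exchange( bits );
  T old;
  std::memcpy( &old, &old_bits, sizeof old );
  return old;
}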

View File

@ -608,6 +608,7 @@ ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
void ROCmInternal::finalize()
{
Kokkos::Impl::rocm_device_synchronize();
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {

View File

@ -277,7 +277,7 @@ public:
this->team_barrier();
value = local_value;
}
// Reduce accross a team of threads.
// Reduce across a team of threads.
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
@ -354,6 +354,80 @@ public:
return buffer[0];
}
// Reduce across a team of threads, with a reducer data type
//
// Each thread has vector_length elements.
// This reduction is for TeamThreadRange operations, where the range
// is spread across threads. Effectively, there are vector_length
// independent reduction operations.
// This is different from a reduction across the elements of a thread,
// which reduces every vector element.
template< class ReducerType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< is_reducer< ReducerType >::value >::type
team_reduce( const ReducerType & reducer) const
{
typedef typename ReducerType::value_type value_type ;
tile_static value_type buffer[512];
const auto local = lindex();
const auto team = team_rank();
auto vector_rank = local%m_vector_length;
auto thread_base = team*m_vector_length;
const std::size_t size = next_pow_2(m_team_size+1)/2;
#if defined(ROCM15)
buffer[local] = reducer.reference();
#else
// ROCM 1.5 handles address spaces better, previous version didn't
lds_for(buffer[local], [&](value_type& x)
{
x = reducer.reference();
});
#endif
m_idx.barrier.wait();
for(std::size_t s = 1; s < size; s *= 2)
{
const std::size_t index = 2 * s * team;
if (index < size)
{
#if defined(ROCM15)
reducer.join(buffer[vector_rank+index*m_vector_length],
buffer[vector_rank+(index+s)*m_vector_length]);
#else
lds_for(buffer[vector_rank+index*m_vector_length], [&](value_type& x)
{
lds_for(buffer[vector_rank+(index+s)*m_vector_length],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
if (local == 0)
{
for(int i=size*m_vector_length; i<m_team_size*m_vector_length; i+=m_vector_length)
#if defined(ROCM15)
reducer.join(buffer[vector_rank], buffer[vector_rank+i]);
#else
lds_for(buffer[vector_rank], [&](value_type& x)
{
lds_for(buffer[vector_rank+i],
[&](value_type& y)
{
reducer.join(x, y);
});
});
#endif
}
m_idx.barrier.wait();
}
/** \brief Intra-team vector reduce
* with intra-team non-deterministic ordering accumulation.
@ -406,6 +480,33 @@ public:
return buffer[thread_base];
}
template< typename ReducerType >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< is_reducer< ReducerType >::value >::type
vector_reduce( ReducerType const & reducer )
{
#ifdef __HCC_ACCELERATOR__
if(blockDim_x == 1) return;
// Intra vector lane shuffle reduction:
typename ReducerType::value_type tmp ( reducer.reference() );
for ( int i = blockDim_x ; ( i >>= 1 ) ; ) {
shfl_down( reducer.reference() , i , blockDim_x );
if ( (int)threadIdx_x < i ) { reducer.join( tmp , reducer.reference() ); }
}
// Broadcast from root lane to all other lanes.
// Cannot use "butterfly" algorithm to avoid the broadcast
// because floating point summation is not associative
// and thus different threads could have different results.
shfl( reducer.reference() , 0 , blockDim_x );
#endif
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
@ -1075,6 +1176,22 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROC
// Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ROCmTeamMember>& loop_boundaries,
const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.team_reduce(reducer);
}
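// Editor's usage sketch of the reducer overload above. The sum reducer is
// spelled Kokkos::Sum in current releases; whether it still lives in
// Kokkos::Experimental at this version is an assumption to check.
template< class TeamMember >
KOKKOS_INLINE_FUNCTION
double example_team_sum( const TeamMember & team, const int n )
{
  double team_sum = 0;
  Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, n ),
    [=]( const int i, double & lsum ) { lsum += double(i); },
    Kokkos::Sum<double>( team_sum ) );
  return team_sum; // every thread of the team sees the reduced value
}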
/** \brief Intra-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
@ -1161,6 +1278,41 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::R
result = loop_boundaries.thread.thread_reduce(result,join);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, ReducerType const & reducer) {
reducer.init( reducer.reference() );
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
}
loop_boundaries.thread.vector_reduce(reducer);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ReducerType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ROCmTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ReducerType const & reducer) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,reducer.reference());
loop_boundaries.thread.team_barrier();
}
reducer.reference() = loop_boundaries.thread.thread_reduce(reducer.reference(),join);
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*

View File

@ -266,7 +266,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
@ -310,7 +310,7 @@ void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
@ -716,12 +716,12 @@ void ThreadsExec::initialize( unsigned thread_count ,
}
// Check for over-subscription
//if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) {
// std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
// std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
// std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
// std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
//}
if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) {
std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl;
std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl;
std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl;
std::cout << " Requested: " << thread_count << " threads per process." << std::endl;
}
// Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space();

View File

@ -50,6 +50,7 @@
#include <cstdio>
#include <utility>
#include <cstdalign>
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -107,7 +108,7 @@ private:
// Which thread am I stealing from currently
int m_current_steal_target;
// This thread's owned work_range
Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
Kokkos::pair<long,long> m_work_range __attribute__((aligned(16))) ;
// Team Offset if one thread determines work_range for others
long m_team_work_index;
@ -191,13 +192,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -233,13 +234,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -268,7 +269,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
@ -295,7 +296,7 @@ public:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
@ -327,7 +328,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
@ -345,8 +346,8 @@ public:
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
@ -357,7 +358,7 @@ public:
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
@ -365,7 +366,7 @@ public:
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal<int>( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
@ -392,13 +393,13 @@ public:
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@ -425,7 +426,7 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@ -433,7 +434,7 @@ public:
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads

View File

@ -107,13 +107,13 @@ public:
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
Impl::spinwait_while_equal<int>( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal<int>( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;

View File

@ -180,12 +180,12 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -193,7 +193,7 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type;
const FunctorType m_functor ;
const MDRangePolicy m_mdr_policy ;
@ -548,14 +548,14 @@ public:
// MDRangePolicy impl
template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::Experimental::MDRangePolicy< Traits ... >
, Kokkos::MDRangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads
>
{
private:
typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ;
typedef typename MDRangePolicy::impl_range_policy Policy ;
typedef typename MDRangePolicy::work_tag WorkTag ;
@ -573,7 +573,7 @@ private:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy
, FunctorType
, WorkTag
, ValueType

View File

@ -49,45 +49,47 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Threads
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Threads,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Threads, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
typedef ParallelFor<FunctorType,
Kokkos::Experimental::WorkGraphPolicy<Traits ...>,
Kokkos::WorkGraphPolicy<Traits ...>,
Kokkos::Threads> Self ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
inline void exec_one_thread() const {
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
inline void exec_one_thread() const noexcept
{
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
static inline void thread_main( ThreadsExec&, const void* arg ) {
static inline void thread_main( ThreadsExec&, const void* arg ) noexcept
{
const Self& self = *(static_cast<const Self*>(arg));
self.exec_one_thread();
}
@ -104,9 +106,9 @@ public:
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -59,7 +59,7 @@
#include <algorithm>
#include <cstdio>
namespace Kokkos { namespace Experimental { namespace Impl {
namespace Kokkos { namespace Impl {
// Temporary, for testing new loop macros
#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1
@ -1274,7 +1274,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i
template <typename T>
using is_void = std::is_same< T , void >;
using is_void_type = std::is_same< T , void >;
template <typename T>
struct is_type_array : std::false_type
@ -1303,7 +1303,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void<ValueType >::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void_type<ValueType >::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -1781,7 +1781,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && !is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && !is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2268,7 +2268,7 @@ template < typename RP
, typename Tag
, typename ValueType
>
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void<ValueType >::value && is_type_array<ValueType>::value >::type >
struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type<ValueType >::value && is_type_array<ValueType>::value >::type >
{
using index_type = typename RP::index_type;
using point_type = typename RP::point_type;
@ -2750,6 +2750,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i
// Cuda uses DeviceIterateTile directly within md_parallel_for
// TODO Once md_parallel_{for,reduce} removed, this can be removed
namespace Experimental {
// ParallelReduce - scalar reductions
template < typename MDRange, typename Functor, typename ValueType = void >
struct MDFunctor
@ -2759,7 +2761,7 @@ struct MDFunctor
using value_type = ValueType;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
@ -2804,7 +2806,7 @@ struct MDFunctor< MDRange, Functor, ValueType[] >
using value_type = ValueType[];
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, value_type
@ -2852,7 +2854,7 @@ struct MDFunctor< MDRange, Functor, void >
using functor_type = Functor;
using work_tag = typename range_policy::work_tag;
using index_type = typename range_policy::index_type;
using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange
using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange
, Functor
, work_tag
, void
@ -2887,8 +2889,9 @@ struct MDFunctor< MDRange, Functor, void >
Functor m_func;
};
} // end namespace Experimental
#undef KOKKOS_ENABLE_NEW_LOOP_MACROS
} } } //end namespace Kokkos::Experimental::Impl
} } //end namespace Kokkos::Impl
#endif

View File

@ -51,9 +51,12 @@
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
bool g_is_initialized = false;
bool g_show_warnings = true;
}
namespace Kokkos { namespace Impl { namespace {
bool is_unsigned_int(const char* str)
{
@ -75,6 +78,10 @@ void initialize_internal(const InitArguments& args)
setenv("MEMKIND_HBW_NODES", "1", 0);
#endif
if (args.disable_warnings) {
g_show_warnings = false;
}
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET )
const int num_threads = args.num_threads;
@ -177,6 +184,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
g_is_initialized = true;
}
void finalize_internal( const bool all_spaces = false )
@ -233,6 +241,9 @@ void finalize_internal( const bool all_spaces = false )
Kokkos::Serial::finalize();
}
#endif
g_is_initialized = false;
g_show_warnings = true;
}
void fence_internal()
@ -306,9 +317,7 @@ bool check_int_arg(char const* arg, char const* expected, int* value) {
return true;
}
} // namespace
} // namespace Impl
} // namespace Kokkos
}}} // namespace Kokkos::Impl::{unnamed}
//----------------------------------------------------------------------------
@ -319,6 +328,7 @@ void initialize(int& narg, char* arg[])
int num_threads = -1;
int numa = -1;
int device = -1;
bool disable_warnings = false;
int kokkos_threads_found = 0;
int kokkos_numa_found = 0;
@ -415,6 +425,12 @@ void initialize(int& narg, char* arg[])
} else {
iarg++;
}
} else if ( strcmp(arg[iarg],"--kokkos-disable-warnings") == 0) {
disable_warnings = true;
for(int k=iarg;k<narg-1;k++) {
arg[k] = arg[k+1];
}
narg--;
} else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
std::cout << std::endl;
std::cout << "--------------------------------------------------------------------------------" << std::endl;
@ -427,6 +443,7 @@ void initialize(int& narg, char* arg[])
std::cout << "settings." << std::endl;
std::cout << std::endl;
std::cout << "--kokkos-help : print this message" << std::endl;
std::cout << "--kokkos-disable-warnings : disable kokkos warning messages" << std::endl;
std::cout << "--kokkos-threads=INT : specify total number of threads or" << std::endl;
std::cout << " number of threads per NUMA region if " << std::endl;
std::cout << " used in conjunction with '--numa' option. " << std::endl;
@ -457,7 +474,7 @@ void initialize(int& narg, char* arg[])
iarg++;
}
InitArguments arguments{num_threads, numa, device};
InitArguments arguments{num_threads, numa, device, disable_warnings};
Impl::initialize_internal(arguments);
}
@ -787,5 +804,9 @@ void print_configuration( std::ostream & out , const bool detail )
out << msg.str() << std::endl;
}
bool is_initialized() noexcept { return g_is_initialized; }
bool show_warnings() noexcept { return g_show_warnings; }
} // namespace Kokkos
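// Editor's usage sketch of the new query and flag (the Kokkos calls above are
// real; the program itself is illustrative). Run as:
//   ./app --kokkos-disable-warnings
#include <Kokkos_Core.hpp>
#include <cstdio>

int main(int argc, char* argv[])
{
  Kokkos::initialize(argc, argv); // strips --kokkos-disable-warnings from argv
  if (Kokkos::is_initialized()) {
    std::printf("warnings are %s\n", Kokkos::show_warnings() ? "on" : "off");
  }
  Kokkos::finalize();
  return 0;
}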

View File

@ -476,7 +476,7 @@ template< class FunctorType , class ArgTag , class T , class Enable >
struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
{
KOKKOS_FORCEINLINE_FUNCTION static
T & init( const FunctorType & f , void * p )
T & init( const FunctorType & , void * p )
{ return *( new(p) T() ); };
};

View File

@ -254,7 +254,12 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const
}
void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
void HostSpace::deallocate( void * const arg_alloc_ptr
, const size_t
#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS )
arg_alloc_size
#endif
) const
{
if ( arg_alloc_ptr ) {
@ -409,7 +414,7 @@ SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr
// Iterate records to print orphaned memory ...
void SharedAllocationRecord< Kokkos::HostSpace , void >::
print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
print_records( std::ostream & s , const Kokkos::HostSpace & , bool detail )
{
SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
}

View File

@ -44,6 +44,9 @@
#include <Kokkos_Macros.hpp>
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE_HPP )
#define KOKKOS_MEMORY_FENCE_HPP
#include <atomic>
namespace Kokkos {
//----------------------------------------------------------------------------
@ -53,23 +56,8 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS )
amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();
#elif defined( KOKKOS_ENABLE_INTEL_ATOMICS )
_mm_mfence();
#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS )
#pragma omp flush
#elif defined( KOKKOS_ENABLE_WINDOWS_ATOMICS )
MemoryBarrier();
#else
#error "Error: memory_fence() not defined"
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -81,12 +69,10 @@ void memory_fence()
KOKKOS_FORCEINLINE_FUNCTION
void store_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"sfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}
@ -98,12 +84,10 @@ void store_fence()
KOKKOS_FORCEINLINE_FUNCTION
void load_fence()
{
#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"lfence" ::: "memory"
);
#if defined( __CUDA_ARCH__ )
__threadfence();
#else
memory_fence();
std::atomic_thread_fence( std::memory_order_seq_cst );
#endif
}

View File

@ -49,51 +49,50 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType ,
Kokkos::Experimental::WorkGraphPolicy< Traits ... > ,
Kokkos::WorkGraphPolicy< Traits ... > ,
Kokkos::Serial
>
: public Kokkos::Impl::Experimental::
WorkGraphExec< FunctorType,
Kokkos::Serial,
Traits ...
>
{
private:
typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ;
typedef Kokkos::Impl::Experimental::
WorkGraphExec<FunctorType, Kokkos::Serial, Traits ... > Base ;
typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ;
Policy m_policy ;
FunctorType m_functor ;
template< class TagType >
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
Base::m_functor( i );
}
exec_one( const std::int32_t w ) const noexcept
{ m_functor( w ); }
template< class TagType >
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_one(const typename Policy::member_type& i) const {
const TagType t{} ;
Base::m_functor( t , i );
}
exec_one( const std::int32_t w ) const noexcept
{ const TagType t{}; m_functor( t , w ); }
public:
inline
void execute()
void execute() const noexcept
{
for (std::int32_t i; (-1 != (i = Base::before_work())); ) {
exec_one< typename Policy::work_tag >( i );
Base::after_work(i);
// Spin until COMPLETED_TOKEN.
// END_TOKEN indicates no work is currently available.
for ( std::int32_t w = Policy::END_TOKEN ;
Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) {
if ( Policy::END_TOKEN != w ) {
exec_one< typename Policy::work_tag >( w );
m_policy.completed_work(w);
}
}
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: Base( arg_functor, arg_policy )
{
}
: m_policy( arg_policy )
, m_functor( arg_functor )
{}
};
} // namespace Impl

View File

@ -306,7 +306,7 @@ print_host_accessible_records( std::ostream & s
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, r->m_alloc_ptr->m_label
);
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}
@ -334,7 +334,7 @@ print_host_accessible_records( std::ostream & s
else {
snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
}
std::cout << buffer ;
s << buffer ;
r = r->m_next ;
} while ( r != root );
}

View File

@ -294,9 +294,13 @@ public:
template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > &
get_record() const
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
SharedAllocationRecord< MemorySpace , void > *
get_record() const noexcept
{
return ( m_record_bits & DO_NOT_DEREF_FLAG )
? (SharedAllocationRecord< MemorySpace,void>*) 0
: static_cast<SharedAllocationRecord<MemorySpace,void>*>(m_record);
}
template< class MemorySpace >
std::string get_label() const
@ -323,6 +327,16 @@ public:
return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0;
}
KOKKOS_FORCEINLINE_FUNCTION
void clear()
{
// If this is tracking then must decrement
KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT
// Reset to default constructed value.
m_record_bits = DO_NOT_DEREF_FLAG ;
}
// Copy:
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT }

View File

@ -48,7 +48,7 @@
#include <impl/Kokkos_Spinwait.hpp>
#include <impl/Kokkos_BitOps.hpp>
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD) || defined( _WIN32 )
#include <thread>
#elif !defined( _WIN32 )
#include <sched.h>
@ -63,9 +63,8 @@
namespace Kokkos {
namespace Impl {
namespace {
void host_thread_yield( const uint32_t i , const int force_yield )
void host_thread_yield( const uint32_t i , const WaitMode mode )
{
static constexpr uint32_t sleep_limit = 1 << 13 ;
static constexpr uint32_t yield_limit = 1 << 12 ;
@ -76,28 +75,26 @@ void host_thread_yield( const uint32_t i , const int force_yield )
// Attempt to put the thread to sleep for 'c' milliseconds
#if defined( KOKKOS_ENABLE_STDTHREAD )
std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) )
#elif !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
auto start = std::chrono::high_resolution_clock::now();
std::this_thread::yield();
std::this_thread::sleep_until( start + std::chrono::nanoseconds( c * 1000 ) );
#else
timespec req ;
req.tv_sec = 0 ;
req.tv_nsec = 1000 * c ;
nanosleep( &req, nullptr );
#else /* defined( _WIN32 ) IS Microsoft Windows */
Sleep(c);
#endif
}
else if ( force_yield || yield_limit < i ) {
else if ( mode == WaitMode::PASSIVE || yield_limit < i ) {
// Attempt to yield thread resources to runtime
#if defined( KOKKOS_ENABLE_STDTHREAD )
#if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 )
std::this_thread::yield();
#elif !defined( _WIN32 )
#else
sched_yield();
#else /* defined( _WIN32 ) IS Microsoft Windows */
YieldProcessor();
#endif
}
@ -138,71 +135,7 @@ void host_thread_yield( const uint32_t i , const int force_yield )
#endif /* defined( KOKKOS_ENABLE_ASM ) */
}
}}} // namespace Kokkos::Impl::{anonymous}
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0);
Kokkos::load_fence();
}
void yield_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
void yield_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1);
Kokkos::load_fence();
}
} /* namespace Impl */
} /* namespace Kokkos */
}} // namespace Kokkos::Impl
#else
void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {}

View File

@ -46,47 +46,95 @@
#define KOKKOS_SPINWAIT_HPP
#include <Kokkos_Macros.hpp>
#include <Kokkos_Atomic.hpp>
#include <cstdint>
#include <type_traits>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
enum class WaitMode : int {
ACTIVE // Used for tight loops to keep threads active longest
, PASSIVE // Used to quickly yield the thread to quiet down the system
};
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
void yield_while_equal( volatile int32_t & flag , const int32_t value );
void yield_until_equal( volatile int32_t & flag , const int32_t value );
void host_thread_yield( const uint32_t i , const WaitMode mode );
void yield_while_equal( volatile int64_t & flag , const int64_t value );
void yield_until_equal( volatile int64_t & flag , const int64_t value );
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value == flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::ACTIVE);
}
Kokkos::load_fence();
}
template <typename T>
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value )
{
Kokkos::store_fence();
uint32_t i = 0 ;
while( value != flag ) {
host_thread_yield(++i, WaitMode::PASSIVE);
}
Kokkos::load_fence();
}
#else
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_while_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int32_t & , const int32_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
spinwait_until_equal( T const volatile & flag, const T value ) {}
template <typename T>
KOKKOS_INLINE_FUNCTION
void yield_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void yield_until_equal( volatile int64_t & , const int64_t ) {}
typename std::enable_if< std::is_integral<T>::value, void>::type
yield_until_equal( T const volatile & flag, const T value ) {}
#endif
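// Editor's usage sketch: the former int32/int64 overloads are now one
// template over any integral type, which is likely why call sites elsewhere
// in this diff gained an explicit <int> argument (the flag field and the enum
// constant compared against it would otherwise deduce different T).
inline void example_wait( std::int64_t const volatile & state )
{
  // stay hot on the core while another thread holds `state` at 0:
  Kokkos::Impl::spinwait_while_equal<std::int64_t>( state, std::int64_t(0) );
}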

View File

@ -111,7 +111,9 @@ struct ViewCtorProp< void , CommonViewAllocProp<Specialize,T> >
using type = CommonViewAllocProp<Specialize,T> ;
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const type & arg ) : value( arg ) {}
KOKKOS_INLINE_FUNCTION
ViewCtorProp( type && arg ) : value( arg ) {}
type value ;
@ -128,6 +130,7 @@ struct ViewCtorProp< void , std::integral_constant<unsigned,I> >
ViewCtorProp & operator = ( const ViewCtorProp & ) = default ;
template< typename P >
KOKKOS_INLINE_FUNCTION
ViewCtorProp( const P & ) {}
};

View File

@ -369,9 +369,9 @@ private:
template< size_t ... DimArgs >
KOKKOS_FORCEINLINE_FUNCTION
bool set( unsigned domain_rank
, unsigned range_rank
, const ViewDimension< DimArgs ... > & dim )
bool set( unsigned
, unsigned
, const ViewDimension< DimArgs ... > & )
{ return true ; }
template< class T , size_t ... DimArgs , class ... Args >
@ -1047,7 +1047,7 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft
template< class DimRHS >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset(
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs ,
const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & ,
const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub )
: m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 )
{
@ -1252,7 +1252,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutLeft const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -1741,7 +1741,7 @@ public:
template< unsigned TrivialScalarSize >
KOKKOS_INLINE_FUNCTION
constexpr ViewOffset
( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
( std::integral_constant<unsigned,TrivialScalarSize> const &
, Kokkos::LayoutRight const & arg_layout
)
: m_dim( arg_layout.dimension[0] , arg_layout.dimension[1]
@ -2368,7 +2368,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2376,7 +2376,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2386,7 +2386,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
@ -2411,7 +2411,7 @@ struct ViewDataHandle< Traits ,
)>::type >
{
typedef typename Traits::value_type value_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ;
typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ;
typedef typename Traits::value_type & return_type ;
typedef Kokkos::Impl::SharedAllocationTracker track_type ;
@ -2419,7 +2419,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( value_type * arg_data_ptr
, track_type const & /*arg_tracker*/ )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr );
@ -2429,7 +2429,7 @@ struct ViewDataHandle< Traits ,
static handle_type assign( handle_type const arg_data_ptr
, size_t offset )
{
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) {
if ( reinterpret_cast<uintptr_t>(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) {
Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute");
}
return handle_type( arg_data_ptr + offset );
@ -2783,6 +2783,11 @@ public:
, m_offset( std::integral_constant< unsigned , 0 >() , arg_layout )
{}
/**\brief Assign data */
KOKKOS_INLINE_FUNCTION
void assign_data( pointer_type arg_ptr )
{ m_handle = handle_type( arg_ptr ); }
//----------------------------------------
/* Allocate and construct mapped array.
* Allocate via shared allocation record and

View File

@ -48,6 +48,7 @@
#include <algorithm>
#include <Kokkos_Macros.hpp>
#include <Kokkos_Core.hpp>
#include <Kokkos_hwloc.hpp>
#include <impl/Kokkos_Error.hpp>
@ -312,14 +313,18 @@ Sentinel::Sentinel()
hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS );
if ( hwloc_bitmap_iszero( s_process_binding ) ) {
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl;
}
const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU );
int num_pu = 1;
if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) {
num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth );
}
else {
if (Kokkos::show_warnings() ) {
std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl;
}
num_pu = 1;
}
hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1);
@ -349,7 +354,7 @@ Sentinel::Sentinel()
hwloc_bitmap_free( s_process_no_core_zero );
if ( ! ok ) {
if ( Kokkos::show_warnings() && ! ok ) {
std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
}
}
@ -503,8 +508,8 @@ Sentinel::Sentinel()
hwloc_bitmap_free( proc_cpuset_location );
if ( ! symmetric ) {
std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
if ( Kokkos::show_warnings() && ! symmetric ) {
std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
<< std::endl ;
}
}

View File

@ -49,6 +49,7 @@ IF(Kokkos_ENABLE_Serial)
serial/TestSerial_SubView_c10.cpp
serial/TestSerial_SubView_c11.cpp
serial/TestSerial_SubView_c12.cpp
serial/TestSerial_SubView_c13.cpp
serial/TestSerial_Team.cpp
serial/TestSerial_TeamReductionScan.cpp
serial/TestSerial_TeamScratch.cpp
@ -96,6 +97,7 @@ IF(Kokkos_ENABLE_Pthread)
threads/TestThreads_SubView_c10.cpp
threads/TestThreads_SubView_c11.cpp
threads/TestThreads_SubView_c12.cpp
threads/TestThreads_SubView_c13.cpp
threads/TestThreads_Team.cpp
threads/TestThreads_TeamReductionScan.cpp
threads/TestThreads_TeamScratch.cpp
@ -143,6 +145,7 @@ IF(Kokkos_ENABLE_OpenMP)
openmp/TestOpenMP_SubView_c10.cpp
openmp/TestOpenMP_SubView_c11.cpp
openmp/TestOpenMP_SubView_c12.cpp
openmp/TestOpenMP_SubView_c13.cpp
openmp/TestOpenMP_Task.cpp
openmp/TestOpenMP_Team.cpp
openmp/TestOpenMP_TeamReductionScan.cpp
@ -184,6 +187,7 @@ IF(Kokkos_ENABLE_Qthreads)
qthreads/TestQthreads_SubView_c10.cpp
qthreads/TestQthreads_SubView_c11.cpp
qthreads/TestQthreads_SubView_c12.cpp
qthreads/TestQthreads_SubView_c13.cpp
qthreads/TestQthreads_Team.cpp
qthreads/TestQthreads_ViewAPI_a.cpp
qthreads/TestQthreads_ViewAPI_b.cpp
@ -235,6 +239,7 @@ IF(Kokkos_ENABLE_Cuda)
cuda/TestCuda_SubView_c10.cpp
cuda/TestCuda_SubView_c11.cpp
cuda/TestCuda_SubView_c12.cpp
cuda/TestCuda_SubView_c13.cpp
cuda/TestCuda_Task.cpp
cuda/TestCuda_Team.cpp
cuda/TestCuda_TeamReductionScan.cpp

View File

@ -56,6 +56,7 @@ else
OBJ_CUDA += TestCuda_SubView_c04.o TestCuda_SubView_c05.o TestCuda_SubView_c06.o
OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o
OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o
OBJ_CUDA += TestCuda_SubView_c13.o
endif
OBJ_CUDA += TestCuda_Reductions.o TestCuda_Scan.o
OBJ_CUDA += TestCuda_Complex.o
@ -169,6 +170,7 @@ else
OBJ_OPENMP += TestOpenMP_SubView_c04.o TestOpenMP_SubView_c05.o TestOpenMP_SubView_c06.o
OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o
OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o
OBJ_OPENMP += TestOpenMP_SubView_c13.o
endif
OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_Scan.o
OBJ_OPENMP += TestOpenMP_Complex.o
@ -258,6 +260,7 @@ else
OBJ_SERIAL += TestSerial_SubView_c04.o TestSerial_SubView_c05.o TestSerial_SubView_c06.o
OBJ_SERIAL += TestSerial_SubView_c07.o TestSerial_SubView_c08.o TestSerial_SubView_c09.o
OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o
OBJ_SERIAL += TestSerial_SubView_c13.o
endif
OBJ_SERIAL += TestSerial_Reductions.o TestSerial_Scan.o
OBJ_SERIAL += TestSerial_Complex.o

View File

@ -467,6 +467,7 @@ TEST_F( TEST_CATEGORY, atomics )
ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 3 ) ) );
#ifndef KOKKOS_ENABLE_OPENMPTARGET
#ifndef KOKKOS_ENABLE_ROCM
ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 1 ) ) );
ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 2 ) ) );
ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex<double>, TEST_EXECSPACE >( 100, 3 ) ) );
@ -475,6 +476,7 @@ TEST_F( TEST_CATEGORY, atomics )
ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 2 ) ) );
ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 3 ) ) );
#endif
#endif
}

View File

@ -71,11 +71,13 @@ struct TestComplexConstruction {
ASSERT_FLOAT_EQ(h_results(7).real(),7.5); ASSERT_FLOAT_EQ(h_results(7).imag(),0.0);
ASSERT_FLOAT_EQ(h_results(8).real(),double(8)); ASSERT_FLOAT_EQ(h_results(8).imag(),0.0);
#ifndef KOKKOS_ENABLE_ROCM
Kokkos::complex<double> a(1.5,2.5),b(3.25,5.25),r_kk;
std::complex<double> sa(a),sb(3.25,5.25),r;
r = a; r_kk = a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
r = sb*a; r_kk = b*a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
r = sa; r_kk = a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag());
#endif
}

View File

@ -65,8 +65,8 @@ struct CountFillFunctor {
template< class ExecSpace >
void test_count_fill(std::int32_t nrows) {
Kokkos::Experimental::Crs<std::int32_t, ExecSpace, void, std::int32_t> graph;
Kokkos::Experimental::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
Kokkos::Crs<std::int32_t, ExecSpace, void, std::int32_t> graph;
Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor<ExecSpace>());
ASSERT_EQ(graph.numRows(), nrows);
auto row_map = Kokkos::create_mirror_view(graph.row_map);
Kokkos::deep_copy(row_map, graph.row_map);

View File

@ -2489,6 +2489,303 @@ struct TestMDRange_6D {
}
};
template <typename ExecSpace >
struct TestMDRange_2D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType**, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[2];
TestMDRange_2D_NegIdx( const DataType L0, const DataType L1, const DataType N0, const DataType N1 ) : input_view( "input_view", N0 - L0, N1 - L1 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j ) const
{
input_view( i - lower_offset[0], j - lower_offset[1] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1] ) * 2;
}
static void test_2D_negidx( const int N0, const int N1 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1}};
const point_type upper{{N0, N1}};
const tile_type tile{{8,8}};
range_type range( point_type{{ lower[0], lower[1] }}, point_type{{ upper[0], upper[1] }}, tile_type{{ tile[0], tile[1] }} );
TestMDRange_2D_NegIdx functor( lower[0], lower[1], upper[0], upper[1] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_3D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType***, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[3];
TestMDRange_3D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) * 2;
}
static void test_3D_negidx( const int N0, const int N1, const int N2 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1}};
const point_type upper{{N0, N1, N2}};
const tile_type tile{{8,8,2}};
range_type range( point_type{{ lower[0], lower[1], lower[2] }}, point_type{{ upper[0], upper[1], upper[2] }}, tile_type{{ tile[0], tile[1], tile[2] }} );
TestMDRange_3D_NegIdx functor( lower[0], lower[1], lower[2], upper[0], upper[1], upper[2] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_4D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType****, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[4];
TestMDRange_4D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
lower_offset[3] = L3;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) * 2;
}
static void test_4D_negidx( const int N0, const int N1, const int N2, const int N3 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1, -1}};
const point_type upper{{N0, N1, N2, N3}};
const tile_type tile{{8,8,2,2}};
range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3] }}, point_type{{ upper[0], upper[1], upper[2], upper[3] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3] }} );
TestMDRange_4D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], upper[0], upper[1], upper[2], upper[3] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_5D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType*****, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[5];
TestMDRange_5D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
lower_offset[3] = L3;
lower_offset[4] = L4;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) * 2;
}
static void test_5D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1, -1, -1}};
const point_type upper{{N0, N1, N2, N3, N4}};
const tile_type tile{{8,4,2,2,2}};
range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4] }} );
TestMDRange_5D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], upper[0], upper[1], upper[2], upper[3], upper[4] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) );
}
}
};
template <typename ExecSpace >
struct TestMDRange_6D_NegIdx {
using value_type = double;
using DataType = int;
using ViewType = typename Kokkos::View< DataType******, ExecSpace >;
using HostViewType = typename ViewType::HostMirror;
ViewType input_view;
DataType lower_offset[6];
TestMDRange_6D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType L5, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4, N5 - L5 )
{
lower_offset[0] = L0;
lower_offset[1] = L1;
lower_offset[2] = L2;
lower_offset[3] = L3;
lower_offset[4] = L4;
lower_offset[5] = L5;
}
// When using negative indices, the View must be offset accordingly, since Views cannot take a negative index
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m, const int n ) const
{
input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) = 1;
}
KOKKOS_INLINE_FUNCTION
void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const
{
lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) * 2;
}
static void test_6D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 )
{
using namespace Kokkos::Experimental;
{
typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::tile_type tile_type;
typedef typename range_type::point_type point_type;
const point_type lower{{-1, -1, -1, -1, -1, -1}};
const point_type upper{{N0, N1, N2, N3, N4, N5}};
const tile_type tile{{8,4,2,2,2,1}};
range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4], lower[5] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4], tile[5] }} );
TestMDRange_6D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], lower[5], upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] );
parallel_for( range, functor );
double sum = 0.0;
parallel_reduce( range, functor, sum );
ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) * (upper[5] - lower[5]) );
}
}
};
} // namespace
TEST_F( TEST_CATEGORY , mdrange_for ) {
@ -2512,6 +2809,14 @@ TEST_F( TEST_CATEGORY , mdrange_array_reduce ) {
TestMDRange_ReduceArray_2D< TEST_EXECSPACE >::test_arrayreduce2( 4, 5 );
TestMDRange_ReduceArray_3D< TEST_EXECSPACE >::test_arrayreduce3( 4, 5, 10 );
}
TEST_F( TEST_CATEGORY , mdrange_neg_idx ) {
TestMDRange_2D_NegIdx< TEST_EXECSPACE >::test_2D_negidx( 128, 32 );
TestMDRange_3D_NegIdx< TEST_EXECSPACE >::test_3D_negidx( 128, 32, 8 );
TestMDRange_4D_NegIdx< TEST_EXECSPACE >::test_4D_negidx( 128, 32, 8, 8 );
TestMDRange_5D_NegIdx< TEST_EXECSPACE >::test_5D_negidx( 128, 32, 8, 8, 4 );
TestMDRange_6D_NegIdx< TEST_EXECSPACE >::test_6D_negidx( 128, 32, 8, 8, 4, 2 );
}
//#endif
} // namespace Test
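All five NegIdx test structs above follow one pattern; condensed to its essentials, a rank-2 sketch (function and view names illustrative):
#include <Kokkos_Core.hpp>
void fill_with_halo( const int N0, const int N1 ) {
  using Kokkos::Experimental::MDRangePolicy;
  using Kokkos::Experimental::Rank;
  // One extra row and column absorb the -1 lower bound.
  Kokkos::View< int** > v( "v", N0 + 1, N1 + 1 );
  MDRangePolicy< Rank<2>, Kokkos::IndexType<int> >
    range( {{ -1, -1 }}, {{ N0, N1 }}, {{ 8, 8 }} );
  Kokkos::parallel_for( range, KOKKOS_LAMBDA( const int i, const int j ) {
    v( i + 1, j + 1 ) = 1;  // shift by the lower bound: Views cannot
                            // take negative indices
  });
}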

View File

@ -521,6 +521,101 @@ void test_memory_pool_corners( const bool print_statistics
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class DeviceType , class Enable = void >
struct TestMemoryPoolHuge
{
TestMemoryPoolHuge() {}
enum : size_t { num_superblock = 0 };
using value_type = long ;
KOKKOS_INLINE_FUNCTION
void operator()( int i , long & err ) const noexcept {}
KOKKOS_INLINE_FUNCTION
void operator()( int i ) const noexcept {}
};
template< class DeviceType >
struct TestMemoryPoolHuge< DeviceType
, typename std::enable_if<
std::is_same< Kokkos::HostSpace
, typename DeviceType::memory_space >
::value >::type
>
{
typedef Kokkos::View< uintptr_t * , DeviceType > ptrs_type ;
typedef Kokkos::MemoryPool< DeviceType > pool_type ;
typedef typename DeviceType::memory_space memory_space ;
pool_type pool ;
ptrs_type ptrs ;
enum : size_t { min_block_size = 512
, max_block_size = 1lu << 31
, min_superblock_size = max_block_size
, num_superblock = 4
, total_alloc_size = num_superblock * max_block_size };
TestMemoryPoolHuge()
: pool( memory_space()
, total_alloc_size
, min_block_size
, max_block_size
, min_superblock_size )
, ptrs( "ptrs" , num_superblock )
{}
// Specify reduction argument value_type to
// avoid confusion with tag-dispatch.
using value_type = long ;
void operator()( int i , long & err ) const noexcept
{
if ( i < int(num_superblock) ) {
ptrs(i) = (uintptr_t) pool.allocate( max_block_size );
#if 0
printf("TestMemoryPoolHuge size(0x%lx) ptr(0x%lx)\n"
, max_block_size
, ptrs(i) );
#endif
if ( ! ptrs(i) ) {
Kokkos::abort("TestMemoryPoolHuge");
++err ;
}
}
}
void operator()( int i ) const noexcept
{
if ( i < int(num_superblock) ) {
pool.deallocate( (void*) ptrs(i) , max_block_size );
ptrs(i) = 0 ;
}
}
};
template< class DeviceType >
void test_memory_pool_huge()
{
typedef typename DeviceType::execution_space execution_space ;
typedef TestMemoryPoolHuge< DeviceType > functor_type ;
typedef Kokkos::RangePolicy< execution_space > policy_type ;
functor_type f ;
policy_type policy( 0 , functor_type::num_superblock );
long err = 0 ;
Kokkos::parallel_reduce( policy , f , err );
Kokkos::parallel_for( policy , f );
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
} // namespace TestMemoryPool
namespace Test {
@ -531,6 +626,7 @@ TEST_F( TEST_CATEGORY, memory_pool )
TestMemoryPool::test_host_memory_pool_stats<>();
TestMemoryPool::test_memory_pool_v2< TEST_EXECSPACE >(false,false);
TestMemoryPool::test_memory_pool_corners< TEST_EXECSPACE >(false,false);
TestMemoryPool::test_memory_pool_huge< TEST_EXECSPACE >();
}
}
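For reference, a host-side sketch of the pool the new huge-allocation test builds, using its parameters (four superblocks of 2^31 bytes, so one allocation can claim an entire 2 GiB superblock):
#include <Kokkos_Core.hpp>
void huge_pool_sketch() {
  using device_t = Kokkos::Device< Kokkos::DefaultHostExecutionSpace,
                                   Kokkos::HostSpace >;
  constexpr size_t max_block = size_t(1) << 31;       // 2 GiB block
  Kokkos::MemoryPool< device_t > pool( Kokkos::HostSpace()
                                     , 4 * max_block  // total capacity
                                     , 512            // min block size
                                     , max_block      // max block size
                                     , max_block );   // min superblock size
  void * p = pool.allocate( max_block );              // nullptr on exhaustion
  if ( p ) { pool.deallocate( p, max_block ); }
}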

View File

@ -141,13 +141,13 @@ struct TestFib
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type root_sched( memory_space()
, MemoryCapacity
, MinBlockSize
, MaxBlockSize
, SuperBlockSize );
, std::min(size_t(MaxBlockSize),MemoryCapacity)
, std::min(size_t(SuperBlockSize),MemoryCapacity) );
future_type f = Kokkos::host_spawn( Kokkos::TaskSingle( root_sched )
, TestFib( root_sched, i ) );
@ -205,11 +205,10 @@ struct TestTaskSpawn {
{
typedef typename sched_type::memory_space memory_space;
// enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool.
enum { MemoryCapacity = 16000 };
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type sched( memory_space()
, MemoryCapacity
@ -277,11 +276,10 @@ struct TestTaskDependence {
{
typedef typename sched_type::memory_space memory_space;
// enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool.
enum { MemoryCapacity = 16000 };
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type sched( memory_space()
, MemoryCapacity
@ -471,13 +469,11 @@ struct TestTaskTeam {
static void run( long n )
{
//const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
//const unsigned memory_capacity = 100000; // Fails with SPAN=1 for serial and OMP.
const unsigned memory_capacity = 400000;
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type root_sched( typename sched_type::memory_space()
, memory_capacity
@ -600,12 +596,11 @@ struct TestTaskTeamValue {
static void run( long n )
{
//const unsigned memory_capacity = 10000; // Causes memory pool infinite loop.
const unsigned memory_capacity = 100000;
enum { MinBlockSize = 64 };
enum { MaxBlockSize = 1024 };
enum { SuperBlockSize = 1u << 12 };
enum { SuperBlockSize = 4096 };
sched_type root_sched( typename sched_type::memory_space()
, memory_capacity
@ -655,7 +650,6 @@ TEST_F( TEST_CATEGORY, task_fib )
TEST_F( TEST_CATEGORY, task_depend )
{
for ( int i = 0; i < 25; ++i ) {
printf("\nTest::task_depend %d\n",i);
TestTaskScheduler::TestTaskDependence< TEST_EXECSPACE >::run( i );
}
}

View File

@ -786,11 +786,40 @@ struct TestViewMirror
ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
}
template< class MemoryTraits >
void static test_mirror_copy() {
Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 10 );
a_org(5) = 42.0;
Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org;
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
ASSERT_EQ( equal_ptr_h_h2, 1 );
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
ASSERT_EQ( a_h.dimension_0(), a_h3.dimension_0() );
ASSERT_EQ( a_h.dimension_0(), a_h2.dimension_0() );
ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() );
ASSERT_EQ( a_org(5), a_h3(5) );
}
void static testit() {
test_mirror< Kokkos::MemoryTraits<0> >();
test_mirror< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
test_mirror_view< Kokkos::MemoryTraits<0> >();
test_mirror_view< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
test_mirror_copy< Kokkos::MemoryTraits<0> >();
test_mirror_copy< Kokkos::MemoryTraits<Kokkos::Unmanaged> >();
}
};
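In short, create_mirror_view_and_copy fuses create_mirror_view with a deep_copy, and degenerates to an aliasing no-op when source and destination memory spaces match, which is exactly what the pointer-equality assertions above verify. A minimal sketch:
Kokkos::View< double* > d( "d", 10 );  // default (device) memory space
// Allocates and deep-copies only when the memory spaces differ;
// otherwise h.data() == d.data().
auto h = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), d );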
@ -1312,10 +1341,12 @@ return;
}
};
#if !defined(KOKKOS_ENABLE_ROCM)
TEST_F( TEST_CATEGORY, view_api )
{
TestViewAPI< double, TEST_EXECSPACE >();
}
#endif
TEST_F( TEST_CATEGORY, view_remap )
{

View File

@ -1294,5 +1294,41 @@ void test_layoutright_to_layoutright() {
}
}
//----------------------------------------------------------------------------
template< class Space >
struct TestUnmanagedSubviewReset
{
Kokkos::View<int****,Space> a ;
KOKKOS_INLINE_FUNCTION
void operator()( int ) const noexcept
{
auto sub_a = Kokkos::subview(a,0,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL);
for ( int i = 0 ; i < int(a.dimension(0)) ; ++i ) {
sub_a.assign_data( & a(i,0,0,0) );
if ( & sub_a(1,1,1) != & a(i,1,1,1) ) {
Kokkos::abort("TestUnmanagedSubviewReset");
}
}
}
TestUnmanagedSubviewReset()
: a( Kokkos::view_alloc() , 20 , 10 , 5 , 2 )
{}
};
template< class Space >
void test_unmanaged_subview_reset()
{
Kokkos::parallel_for
( Kokkos::RangePolicy< typename Space::execution_space >(0,1)
, TestUnmanagedSubviewReset<Space>()
);
}
} // namespace TestViewSubview
#endif

View File

@ -67,7 +67,7 @@ template< class ExecSpace >
struct TestWorkGraph {
using MemorySpace = typename ExecSpace::memory_space;
using Policy = Kokkos::Experimental::WorkGraphPolicy<std::int32_t, ExecSpace>;
using Policy = Kokkos::WorkGraphPolicy<std::int32_t, ExecSpace>;
using Graph = typename Policy::graph_type;
using RowMap = typename Graph::row_map_type;
using Entries = typename Graph::entries_type;
@ -117,6 +117,7 @@ struct TestWorkGraph {
m_graph.row_map = RowMap("row_map", hg.size() + 1); // row map always has one more
m_graph.entries = Entries("entries", hg.size() - 1); // all but the first have a parent
m_values = Values("values", hg.size());
//printf("%zu work items\n", hg.size());
auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map);
auto h_entries = Kokkos::create_mirror_view(m_graph.entries);
auto h_values = Kokkos::create_mirror_view(m_values);
@ -156,17 +157,15 @@ struct TestWorkGraph {
} // anonymous namespace
TEST_F( TEST_CATEGORY, DISABLED_workgraph_fib )
TEST_F( TEST_CATEGORY, workgraph_fib )
{
#ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND
int limit = 15;
#else
int limit = 27;
#endif
for ( int i = 0; i < limit; ++i) {
TestWorkGraph< TEST_EXECSPACE > f(i);
f.test_for();
}
//TestWorkGraph< TEST_EXECSPACE > f(2);
//f.test_for();
}
} // namespace Test
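With WorkGraphPolicy promoted out of the Experimental namespace (changelog issue #1126), the Fibonacci test now runs by default. A condensed sketch of the dispatch pattern, assuming the policy is constructed directly from a CRS dependence graph as the test does (the empty graph and no-op functor are illustrative):
#include <Kokkos_Core.hpp>
#include <cstdint>
struct NoOpWork {
  KOKKOS_INLINE_FUNCTION void operator()( std::int32_t /*item*/ ) const {}
};
void workgraph_sketch() {
  using policy_t = Kokkos::WorkGraphPolicy< std::int32_t,
                                            Kokkos::DefaultExecutionSpace >;
  using graph_t  = policy_t::graph_type;
  graph_t graph;
  // row_map has one entry per work item plus one; entries list, for each
  // item, the items that must complete before it may run (none here).
  graph.row_map = graph_t::row_map_type( "row_map", 1 );  // zero work items
  graph.entries = graph_t::entries_type( "entries", 0 );
  Kokkos::parallel_for( policy_t( graph ), NoOpWork() );
}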

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <cuda/TestCudaUVM_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <cuda/TestCuda_SubView_c10.cpp>
#include <cuda/TestCuda_SubView_c11.cpp>
#include <cuda/TestCuda_SubView_c12.cpp>
#include <cuda/TestCuda_SubView_c13.cpp>

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <openmp/TestOpenMP_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <openmp/TestOpenMP_SubView_c10.cpp>
#include <openmp/TestOpenMP_SubView_c11.cpp>
#include <openmp/TestOpenMP_SubView_c12.cpp>
#include <openmp/TestOpenMP_SubView_c13.cpp>

View File

@ -0,0 +1,55 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <qthreads/TestQthreads.hpp>
namespace Test {
TEST_F( qthreads, view_test_unmanaged_subview_reset )
{
#if 0
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
#endif
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <qthreads/TestQthreads_SubView_c10.cpp>
#include <qthreads/TestQthreads_SubView_c11.cpp>
#include <qthreads/TestQthreads_SubView_c12.cpp>
#include <qthreads/TestQthreads_SubView_c13.cpp>

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <serial/TestSerial_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

View File

@ -10,3 +10,4 @@
#include <serial/TestSerial_SubView_c10.cpp>
#include <serial/TestSerial_SubView_c11.cpp>
#include <serial/TestSerial_SubView_c12.cpp>
#include <serial/TestSerial_SubView_c13.cpp>

View File

@ -0,0 +1,54 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <threads/TestThreads_Category.hpp>
#include <TestViewSubview.hpp>
namespace Test {
TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset )
{
TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >();
}
} // namespace Test

Binary file not shown.

View File

@ -31,6 +31,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
OBJ_EXAMPLE_FEINT += feint_cuda.o
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_EXAMPLE_FEINT += feint_rocm.o
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_EXAMPLE_FEINT += feint_threads.o
endif

View File

@ -0,0 +1,67 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_ENABLE_ROCM )
#include <feint.hpp>
namespace Kokkos {
namespace Example {
template void feint<Kokkos::Experimental::ROCm,false>(
const unsigned global_elem_nx ,
const unsigned global_elem_ny ,
const unsigned global_elem_nz );
template void feint<Kokkos::Experimental::ROCm,true>(
const unsigned global_elem_nx ,
const unsigned global_elem_ny ,
const unsigned global_elem_nz );
} /* namespace Example */
} /* namespace Kokkos */
#endif

View File

@ -110,6 +110,27 @@ Perf fenl< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemQuadratic >(
#endif
#if defined( KOKKOS_ENABLE_ROCM )
template
Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >(
MPI_Comm comm ,
const int use_print ,
const int use_trials ,
const int use_atomic ,
const int global_elems[] );
template
Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemQuadratic >(
MPI_Comm comm ,
const int use_print ,
const int use_trials ,
const int use_atomic ,
const int global_elems[] );
#endif
} /* namespace FENL */
} /* namespace Example */

View File

@ -67,6 +67,7 @@ enum { CMD_USE_THREADS = 0
, CMD_USE_NUMA
, CMD_USE_CORE_PER_NUMA
, CMD_USE_CUDA
, CMD_USE_ROCM
, CMD_USE_OPENMP
, CMD_USE_CUDA_DEV
, CMD_USE_FIXTURE_X
@ -114,6 +115,9 @@ void print_cmdline( std::ostream & s , const int cmd[] )
if ( cmd[ CMD_USE_CUDA ] ) {
s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ;
}
if ( cmd[ CMD_USE_ROCM ] ) {
s << " ROCM" ;
}
if ( cmd[ CMD_USE_ATOMIC ] ) {
s << " ATOMIC" ;
}
@ -167,6 +171,7 @@ void run( MPI_Comm comm , const int cmd[] )
if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
else if ( cmd[ CMD_USE_ROCM ] ) { std::cout << "ROCM" ; }
if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
else { std::cout << " , LINEAR-ELEMENT" ; }
@ -288,6 +293,9 @@ int main( int argc , char ** argv )
cmdline[ CMD_USE_CUDA ] = 1 ;
cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
}
else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) {
cmdline[ CMD_USE_ROCM ] = 1 ;
}
else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
sscanf( argv[++i] , "%dx%dx%d" ,
cmdline + CMD_USE_FIXTURE_X ,
@ -410,6 +418,21 @@ int main( int argc , char ** argv )
Kokkos::HostSpace::execution_space::finalize();
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
if ( cmdline[ CMD_USE_ROCM ] ) {
// Use the last device:
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( cmdline[ CMD_USE_ROCM ] ) );
run< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
#endif
}

View File

@ -300,5 +300,16 @@ int main()
Kokkos::HostSpace::execution_space::finalize();
}
#endif
#if defined( KOKKOS_ENABLE_ROCM )
{
std::cout << "test_fixture< ROCm >" << std::endl ;
Kokkos::HostSpace::execution_space::initialize();
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
Kokkos::Example::test_fixture< Kokkos::Experimental::ROCm >();
Kokkos::Experimental::ROCm::finalize();
Kokkos::HostSpace::execution_space::finalize();
}
#endif
}

View File

@ -53,6 +53,10 @@ template void test_fixture< Kokkos::HostSpace::execution_space >();
template void test_fixture<Kokkos::Cuda>();
#endif
#if defined( KOKKOS_ENABLE_ROCM )
template void test_fixture<Kokkos::Experimental::ROCm>();
#endif
} /* namespace Example */
} /* namespace Kokkos */

View File

@ -23,6 +23,15 @@ EXE = 02_simple_reduce.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 02_simple_reduce.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +49,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -24,6 +24,16 @@ EXE = 02_simple_reduce_lambda.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 02_simple_reduce_lambda.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -41,7 +51,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -23,6 +23,16 @@ EXE = 03_simple_view.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 03_simple_view.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -41,7 +51,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -24,6 +24,16 @@ EXE = 03_simple_view_lambda.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 03_simple_view_lambda.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -41,7 +51,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -23,6 +23,16 @@ EXE = 04_simple_memoryspaces.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 04_simple_memoryspaces.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +50,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -23,6 +23,16 @@ EXE = 05_simple_atomics.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 05_simple_atomics.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +50,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

View File

@ -0,0 +1,10 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
# This is a tutorial, not a test, so we don't ask CTest to run it.
TRIBITS_ADD_EXECUTABLE(
tutorial_06_simple_mdrangepolicy
SOURCES simple_mdrangepolicy.cpp
COMM serial mpi
)

View File

@ -0,0 +1,48 @@
KOKKOS_PATH = ../../..
KOKKOS_SRC_PATH = ${KOKKOS_PATH}
SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/06_simple_mdrangepolicy/*.cpp)
vpath %.cpp $(sort $(dir $(SRC)))
default: build
echo "Start Build"
ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES)))
CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 06_simple_mdrangepolicy.cuda
KOKKOS_DEVICES = "Cuda,OpenMP"
KOKKOS_ARCH = "SNB,Kepler35"
else
CXX = g++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 06_simple_mdrangepolicy.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
DEPFLAGS = -M
OBJ = $(notdir $(SRC:.cpp=.o))
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
test: $(EXE)
./$(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
# Compilation rules
%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@)

View File

@ -0,0 +1,201 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#include <cstdio>
//
// MDRangePolicy example with parallel_for and parallel_reduce:
// 1. Start up Kokkos
// 2. Execute a parallel_for loop in the default execution space,
// using a functor to define the loop body
// 3. Shut down Kokkos
//
// Two examples are provided:
// Example 1: Rank 2 case with minimal default parameters and arguments used
// in the MDRangePolicy
//
// Example 2: Rank 3 case with additional outer/inner iterate pattern parameters
// and tile dims passed to the ctor
// Simple functor for computing/storing the product of indices in a View v
template < class ViewType >
struct MDFunctor {
typedef long value_type;
ViewType v;
size_t size;
MDFunctor( const ViewType & v_, const size_t size_ )
: v(v_), size(size_) {}
// 2D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j) const {
v(i,j) = i*j; // compute the product of indices
}
// 3D case - used by parallel_for
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, const int k) const {
v(i,j,k) = i*j*k; // compute the product of indices
}
// 2D case - reduction
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, value_type & incorrect_count) const {
if ( v(i,j) != i*j ) {
incorrect_count += 1;
}
}
// 3D case - reduction
KOKKOS_INLINE_FUNCTION
void operator () (const int i, const int j, const int k, value_type & incorrect_count) const {
if ( v(i,j,k) != i*j*k ) {
incorrect_count += 1;
}
}
};
int main (int argc, char* argv[]) {
Kokkos::initialize (argc, argv);
// Bound(s) for MDRangePolicy
const int n = 100;
// ViewType typedefs for Rank<2>, Rank<3> for example usage
typedef double ScalarType;
typedef typename Kokkos::View<ScalarType**> ViewType_2D;
typedef typename Kokkos::View<ScalarType***> ViewType_3D;
/////////////////////////////////////////////////////////////////////////////
// Explanation of MDRangePolicy usage, template parameters, constructor arguments
//
// MDRangePolicy typedefs for Rank<2>, Rank<3> cases
// Required template parameters:
// Kokkos::Rank<N>: where N=rank
//
// Optional template parameters to Rank<...>:
// Kokkos::Iterate::{Default,Left,Right}: Outer iteration pattern across tiles;
// defaults based on the execution space similar to Kokkos::Layout
// Kokkos::Iterate::{Default,Left,Right}: Inner iteration pattern within tiles;
// defaults based on the execution space similar to Kokkos::Layout
//
// e.g. typedef Rank<2, Iterate::Left, Iterate::Left> rank2ll;
//
//
// Optional template parameters to MDRangePolicy:
// ExecutionSpace: Kokkos::Serial, Kokkos::OpenMP, Kokkos::Cuda, etc.
//
// Kokkos::IndexType< T >: where T = int, long, unsigned int, etc.
//
// struct Tag{}: A user-provided tag for tagging functor operators
//
// e.g. 1: MDRangePolicy< Kokkos::Serial, Rank<2, Iterate::Left, Iterate::Left>, IndexType<int>, Tag > mdpolicy;
// e.g. 2: MDRangePolicy< Kokkos::Serial, rank2ll, IndexType<int>, Tag > mdpolicy;
//
//
// Required arguments to ctor:
// {{ l0, l1, ... }}: Lower bounds, provided as Kokkos::Array or std::initializer_list
// {{ u0, u1, ... }}: Upper bounds, provided as Kokkos::Array or std::initializer_list
//
// Optional arguments to ctor:
// {{ t0, t1, ... }}: Tile dimensions, provided as Kokkos::Array or std::initializer_list
// defaults based on the execution space
//
// e.g. mdpolicy( {{0,0}}, {{u0,u1}}, {{t0,t1}} );
//
/////////////////////////////////////////////////////////////////////////////
// Example 1:
long incorrect_count_2d = 0;
{
// Rank<2> Case: Rank is provided, all other parameters are default
typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> > MDPolicyType_2D;
// Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims defaulted
MDPolicyType_2D mdpolicy_2d( {{0,0}}, {{n,n}} );
// Construct a 2D view to store result of product of indices
ViewType_2D v2("v2", n, n);
// Execute parallel_for with rank 2 MDRangePolicy
Kokkos::parallel_for( "md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n) );
// Check results with a parallel_reduce using the MDRangePolicy
Kokkos::parallel_reduce( "md2dredux", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d );
printf("Rank 2 MDRangePolicy incorrect count: %ld\n", incorrect_count_2d); // should be 0
}
// Example 2:
long incorrect_count_3d = 0;
{
// Rank<3> Case: Rank, inner iterate pattern, outer iterate pattern provided
typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left, Kokkos::Experimental::Iterate::Left> > MDPolicyType_3D;
// Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided
MDPolicyType_3D mdpolicy_3d( {{0,0,0}}, {{n,n,n}}, {{4,4,4}} );
// Construct a 3D view to store result of product of indices
ViewType_3D v3("v3", n, n, n);
// Execute parallel_for with rank 3 MDRangePolicy
Kokkos::parallel_for( "md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n) );
// Check results with a parallel_reduce using the MDRangePolicy
Kokkos::parallel_reduce( "md3dredux", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d );
printf("Rank 3 MDRangePolicy incorrect count: %ld\n", incorrect_count_3d); // should be 0
}
Kokkos::finalize ();
return (incorrect_count_2d == long(0) && incorrect_count_3d == long(0)) ? 0 : -1;
}

View File

@ -23,6 +23,16 @@ EXE = 01_data_layouts.host
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "SNB"
endif
ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES)))
CXX = /opt/rocm/hcc/bin/clang++
CXXFLAGS = -O3
LINK = ${CXX}
LINKFLAGS =
EXE = 01_data_layouts.rocm
KOKKOS_DEVICES = "ROCm"
KOKKOS_ARCH = "Fiji"
endif
DEPFLAGS = -M
@ -40,7 +50,7 @@ $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o *.cuda *.host
rm -f *.o *.cuda *.host *.rocm
# Compilation rules

Some files were not shown because too many files have changed in this diff.