Snapshot of kokkos.git from commit 0a776f65e7429b875839719c4fe528c15e871e46

From repository at git@github.com:/kokkos/kokkos.git

At commit:
commit 0a776f65e7429b875839719c4fe528c15e871e46
Author: crtrott <crtrott@sandia.gov>
Date:   Thu Dec 10 11:51:50 2015 -0700

    Adding CUDA 7.5 as secondary compiler to README
This commit is contained in:
crtrott
2015-12-10 11:52:34 -07:00
parent 4099f540dd
commit 91a791bbb3
184 changed files with 41315 additions and 3847 deletions

8
lib/kokkos/.gitignore vendored Normal file
View File

@ -0,0 +1,8 @@
# Standard ignores
# Editor backup files
*~
# Python bytecode
*.pyc
# Emacs auto-save / lock files
\#*#
.#*
# Vim swap files
.*.swp
# Eclipse project metadata
.cproject
.project

123
lib/kokkos/CMakeLists.txt Normal file
View File

@ -0,0 +1,123 @@
#
# A) Forward declare the package so that certain options are also defined for
# subpackages
#
TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
#------------------------------------------------------------------------------
#
# B) Define the common options for Kokkos first so they can be used by
# subpackages as well.
#
# Each TRIBITS_ADD_OPTION_AND_DEFINE call below takes four arguments:
#   (CMake option name, C preprocessor define, docstring, default value).
# Defaults that reference TPL_ENABLE_* / ${PROJECT_NAME}_ENABLE_* variables
# inherit the enclosing (Trilinos) project's TPL/feature configuration.
TRIBITS_ADD_DEBUG_OPTION()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_SIERRA_BUILD
KOKKOS_FOR_SIERRA
"Configure Kokkos for building within the Sierra build system."
OFF
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda
KOKKOS_HAVE_CUDA
"Enable CUDA support in Kokkos."
"${TPL_ENABLE_CUDA}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Cuda_UVM
KOKKOS_USE_CUDA_UVM
"Enable CUDA Unified Virtual Memory support in Kokkos."
OFF
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Pthread
KOKKOS_HAVE_PTHREAD
"Enable Pthread support in Kokkos."
"${TPL_ENABLE_Pthread}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_OpenMP
KOKKOS_HAVE_OPENMP
"Enable OpenMP support in Kokkos."
"${${PROJECT_NAME}_ENABLE_OpenMP}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_QTHREAD
KOKKOS_HAVE_QTHREAD
"Enable QTHREAD support in Kokkos."
"${TPL_ENABLE_QTHREAD}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_CXX11
KOKKOS_HAVE_CXX11
"Enable C++11 support in Kokkos."
"${${PROJECT_NAME}_ENABLE_CXX11}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_HWLOC
KOKKOS_HAVE_HWLOC
"Enable HWLOC support in Kokkos."
"${TPL_ENABLE_HWLOC}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_MPI
KOKKOS_HAVE_MPI
"Enable MPI support in Kokkos."
"${TPL_ENABLE_MPI}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Debug_Bounds_Check
KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
"Enable bounds checking support in Kokkos."
OFF
)
#TRIBITS_ADD_OPTION_AND_DEFINE(
# Kokkos_ENABLE_Profiling_Collect_Kernel_Data
# KOKKOS_ENABLE_PROFILING_COLLECT_KERNEL_DATA
# "Enable profiling support for kernel data collections in Kokkos."
# "${${PROJECT_NAME}_ENABLE_KokkosProfiler}"
# )
# placeholder for future device...
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Winthread
KOKKOS_HAVE_WINTHREAD
"Enable Winthread support in Kokkos."
"${TPL_ENABLE_Winthread}"
)
#------------------------------------------------------------------------------
#
# C) Process the subpackages for Kokkos
#
TRIBITS_PROCESS_SUBPACKAGES()
#
# D) If Kokkos itself is enabled, process the Kokkos package
#
TRIBITS_PACKAGE_DEF()
TRIBITS_EXCLUDE_AUTOTOOLS_FILES()
# Keep documentation and refactoring-notes directories out of the
# distribution/tarball.
TRIBITS_EXCLUDE_FILES(
classic/doc
classic/LinAlg/doc/CrsRefactorNotesMay2012
)
TRIBITS_PACKAGE_POSTPROCESS()

View File

@ -0,0 +1,73 @@
Developers of Kokkos (those who commit modifications to Kokkos)
must maintain the snapshot of Kokkos in the Trilinos repository.
This file contains instructions for how to
snapshot Kokkos from github.com/kokkos to Trilinos.
------------------------------------------------------------------------
*** EVERYTHING GOES RIGHT WORKFLOW ***
1) Given a 'git clone' of Kokkos and of Trilinos repositories.
1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone.
This path *must* terminate with the directory name 'kokkos';
e.g., ${HOME}/kokkos .
1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory.
2) Given that the Kokkos build & test is clean and
changes are committed to the Kokkos clone.
3) Snapshot the current commit in the Kokkos clone into the Trilinos clone.
This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}:
${KOKKOS}/config/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages
4) Verify the snapshot commit happened as expected
cd ${TRILINOS}/packages/kokkos
git log -1 --name-only
5) Modify, build, and test Trilinos with the Kokkos snapshot.
6) Given that the Trilinos build & test is clean and
changes are committed to the Trilinos clone.
7) Attempt push to the Kokkos repository.
If push fails then you must 'remove the Kokkos snapshot'
from your Trilinos clone.
See below.
8) Attempt to push to the Trilinos repository.
If updating for a failed push requires you to change Kokkos you must
'remove the Kokkos snapshot' from your Trilinos clone.
See below.
------------------------------------------------------------------------
*** WHEN SOMETHING GOES WRONG AND YOU MUST ***
*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE ***
1) Query the Trilinos clone commit log.
git log --oneline
2) Note the <SHA1> of the commit to the Trilinos clone
immediately BEFORE the Kokkos snapshot commit.
Copy this <SHA1> for use in the next command.
3) IF more than one outstanding commit then you can remove just the
Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file.
Remove or comment out the Kokkos snapshot commit entry.
git rebase -i <SHA1>
4) IF the Kokkos snapshot commit is the one and only
outstanding commit then remove just that commit.
git reset --hard HEAD~1
------------------------------------------------------------------------
*** REGARDING 'snapshot.py' TOOL ***
The 'snapshot.py' tool is developed and maintained by the
Center for Computing Research (CCR)
Software Engineering, Maintenance, and Support (SEMS) team.
Contact Brent Perschbacher <bmpersc@sandia.gov> for questions.
------------------------------------------------------------------------

View File

@ -1,20 +1,18 @@
# Default settings common options
KOKKOS_PATH=../../lib/kokkos
#Options: OpenMP,Serial,Pthreads,Cuda
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8
#KOKKOS_DEVICES ?= "OpenMP"
KOKKOS_DEVICES ?= "Pthreads"
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
KOKKOS_ARCH ?= ""
#Options: yes,no
KOKKOS_DEBUG ?= "no"
#Options: hwloc,librt
#Options: hwloc,librt,experimental_memkind
KOKKOS_USE_TPLS ?= ""
#Options: c++11
KOKKOS_CXX_STANDARD ?= "c++11"
#Options: kernel_times,aggregate_mpi
KOKKOS_PROFILING ?= ""
#Options: aggressive_vectorization
KOKKOS_OPTIONS ?= ""
#Default settings specific options
#Options: force_uvm,use_ldg,rdc,enable_lambda
@ -30,8 +28,10 @@ KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | gr
# Check for external libraries
KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
# Check for advanced settings
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "force_uvm" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
@ -50,10 +50,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
endif
endif
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
KOKKOS_INTERNAL_COMPILER_CRAY := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname | grep CYGWIN | wc -l)
KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
KOKKOS_INTERNAL_COMPILER_CRAY := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname | grep CYGWIN | wc -l)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
@ -93,8 +94,10 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
#NVIDIA based
NVCC_WRAPPER := $(KOKKOS_PATH)/config/nvcc_wrapper
KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l))
@ -135,8 +138,9 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
#Any AVX?
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
#Incompatible flags?
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc ))
@ -225,6 +229,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
KOKKOS_LIBS += -lrt
endif
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
KOKKOS_LIBS += -lmemkind
tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
endif
tmp := $(shell echo "/* Optimization Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
tmp := $(shell echo "\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION 1" >> KokkosCore_config.tmp )
endif
tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
@ -265,8 +282,41 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
KOKKOS_CXXFLAGS += -march=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xCORE-AVX2
KOKKOS_LDFLAGS += -xCORE-AVX2
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler
KOKKOS_CXXFLAGS += -march=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2
endif
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xMIC-AVX512
KOKKOS_LDFLAGS += -xMIC-AVX512
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler
KOKKOS_CXXFLAGS += -march=knl
KOKKOS_LDFLAGS += -march=knl
endif
endif
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)

View File

@ -55,3 +55,8 @@ Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
endif
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
Kokkos_HBWAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp

View File

@ -20,6 +20,13 @@ GTC 2015:
A programming guide can be found under doc/Kokkos_PG.pdf. This is an initial version
and feedback is greatly appreciated.
A separate repository with extensive tutorial material can be found under
https://github.com/kokkos/kokkos-tutorials.
If you have a patch to contribute please feel free to issue a pull request against
the develop branch. For major contributions it is better to contact us first
for guidance.
For questions please send an email to
kokkos-users@software.sandia.gov
@ -43,6 +50,7 @@ Primary tested compilers are:
Secondary tested compilers are:
CUDA 6.5 (with gcc 4.7.2)
CUDA 7.0 (with gcc 4.7.2)
CUDA 7.5 (with gcc 4.7.2)
Other compilers working:
PGI 15.4

View File

@ -0,0 +1,10 @@
# TriBITS build file for the KokkosAlgorithms subpackage.
TRIBITS_SUBPACKAGE(Algorithms)
# Library sources live under src/.
ADD_SUBDIRECTORY(src)
# Unit tests are registered with the TriBITS test system; the
# performance tests are currently disabled.
TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
#TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
TRIBITS_SUBPACKAGE_POSTPROCESS()

View File

@ -0,0 +1,5 @@
# Dependency declaration for the KokkosAlgorithms subpackage: it always
# requires KokkosCore; threading/CUDA/HWLOC TPLs are optional, and the
# tests can additionally use CUSPARSE when available.
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -0,0 +1,4 @@
/* Configuration header for the KokkosAlgorithms subpackage.
 * Currently an empty placeholder; configure-time defines would be
 * added between the guard macros. */
#ifndef KOKKOS_ALGORITHMS_CONFIG_H
#define KOKKOS_ALGORITHMS_CONFIG_H
#endif /* KOKKOS_ALGORITHMS_CONFIG_H */

View File

@ -0,0 +1,21 @@
# Generate the subpackage config header into the binary dir.
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
#-----------------------------------------------------------------------------
# NOTE(review): globbing means newly added files are not picked up until
# CMake is re-run -- consistent with the rest of this snapshot, so left as-is.
FILE(GLOB HEADERS *.hpp)
FILE(GLOB SOURCES *.cpp)
# The generated config header is installed along with the real headers.
LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
#-----------------------------------------------------------------------------
TRIBITS_ADD_LIBRARY(
kokkosalgorithms
HEADERS ${HEADERS}
SOURCES ${SOURCES}
DEPLIBS
)

View File

@ -45,7 +45,7 @@
#define KOKKOS_RANDOM_HPP
#include <Kokkos_Core.hpp>
//#include <Kokkos_Complex.hpp>
#include <Kokkos_Complex.hpp>
#include <cstdio>
#include <cstdlib>
#include <cmath>
@ -475,6 +475,58 @@ namespace Kokkos {
};
// Specialization of rand for ::Kokkos::complex<float>.  The real and
// imaginary parts are drawn independently via the generator's frand()
// (presumably uniform floats -- matches the other rand specializations;
// confirm against the Generator contract).
template<class Generator>
struct rand<Generator, ::Kokkos::complex<float> > {
// Upper bound reported for this type: (1,1).
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> max () {
return ::Kokkos::complex<float> (1.0, 1.0);
}
// Draw with the generator's default range, independently per component.
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> draw (Generator& gen) {
const float re = gen.frand ();
const float im = gen.frand ();
return ::Kokkos::complex<float> (re, im);
}
// Draw with each component bounded by the matching component of 'range'.
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& range) {
const float re = gen.frand (real (range));
const float im = gen.frand (imag (range));
return ::Kokkos::complex<float> (re, im);
}
// Draw with each component in [start_i, end_i) per matching component.
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& start, const ::Kokkos::complex<float>& end) {
const float re = gen.frand (real (start), real (end));
const float im = gen.frand (imag (start), imag (end));
return ::Kokkos::complex<float> (re, im);
}
};
// Specialization of rand for ::Kokkos::complex<double>; mirrors the
// complex<float> specialization but uses the generator's drand().
template<class Generator>
struct rand<Generator, ::Kokkos::complex<double> > {
// Upper bound reported for this type: (1,1).
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> max () {
return ::Kokkos::complex<double> (1.0, 1.0);
}
// Draw with the generator's default range, independently per component.
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> draw (Generator& gen) {
const double re = gen.drand ();
const double im = gen.drand ();
return ::Kokkos::complex<double> (re, im);
}
// Draw with each component bounded by the matching component of 'range'.
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& range) {
const double re = gen.drand (real (range));
const double im = gen.drand (imag (range));
return ::Kokkos::complex<double> (re, im);
}
// Draw with each component in [start_i, end_i) per matching component.
KOKKOS_INLINE_FUNCTION
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& start, const ::Kokkos::complex<double>& end) {
const double re = gen.drand (real (start), real (end));
const double im = gen.drand (imag (start), imag (end));
return ::Kokkos::complex<double> (re, im);
}
};
template<class DeviceType>
class Random_XorShift64_Pool;

View File

@ -0,0 +1,38 @@
# Build the KokkosAlgorithms unit-test executable.
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
# NOTE(review): TestCuda.cpp is added unconditionally, unlike the other
# backends which are guarded by Kokkos_ENABLE_* -- presumably the file
# compiles to nothing without CUDA; confirm.
SET(SOURCES
UnitTestMain.cpp
TestCuda.cpp
)
SET(LIBRARIES kokkoscore)
# Backend-specific test sources, added only when the backend is enabled.
IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES
TestOpenMP.cpp
)
ENDIF()
IF(Kokkos_ENABLE_Serial)
LIST( APPEND SOURCES
TestSerial.cpp
)
ENDIF()
IF(Kokkos_ENABLE_Pthread)
LIST( APPEND SOURCES
TestThreads.cpp
)
ENDIF()
# Register the executable with CTest; a googletest " FAILED " line in the
# output marks the test as failed.
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest
SOURCES ${SOURCES}
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)

View File

@ -6,12 +6,12 @@ vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests
default: build_all
echo "End Build"
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
@ -56,7 +56,7 @@ KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Threads
KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_OpenMP
@ -74,11 +74,11 @@ test-openmp: KokkosAlgorithms_UnitTest_OpenMP
test-serial: KokkosAlgorithms_UnitTest_Serial
./KokkosAlgorithms_UnitTest_Serial
build_all: $(TARGETS)
test: $(TEST_TARGETS)
clean: kokkos-clean
rm -f *.o $(TARGETS)

View File

@ -0,0 +1,10 @@
# Declare the Kokkos subpackages: directory, TriBITS classification,
# and whether each is required by the parent package.
# NOTE(review): PS/EX look like TriBITS test-group classifications
# (primary-stable / experimental) -- confirm against the TriBITS guide.
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
#SubPackageName Directory Class Req/Opt
#
# New Kokkos subpackages:
Core core PS REQUIRED
Containers containers PS OPTIONAL
Algorithms algorithms PS OPTIONAL
Example example EX OPTIONAL
)

View File

@ -0,0 +1,75 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
# Check for CUDA support.  cuSPARSE ships with the CUDA toolkit, and this
# TPL requires CUDA 4.1 or newer.
IF (NOT TPL_ENABLE_CUDA OR CUDA_VERSION VERSION_LESS "4.1")
MESSAGE(FATAL_ERROR "\nCUSPARSE: did not find acceptable version of CUDA libraries (4.1 or greater)")
ELSE()
IF(CMAKE_VERSION VERSION_LESS "2.8.8")
# FindCUDA before CMake 2.8.8 does not find the cusparse library;
# therefore, we must locate it ourselves relative to the toolkit root.
find_library(CUDA_cusparse_LIBRARY
cusparse
HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib
)
IF(CUDA_cusparse_LIBRARY STREQUAL "CUDA_cusparse_LIBRARY-NOTFOUND")
# Fixed typo in the original error message ("cuspasre").
MESSAGE(FATAL_ERROR "\nCUSPARSE: could not find cusparse library.")
ENDIF()
ENDIF(CMAKE_VERSION VERSION_LESS "2.8.8")
# Publish the TPL's settings to TriBITS: headers come from CUDA itself,
# and no extra library search directory is needed.
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
ENDIF()

View File

@ -0,0 +1,71 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Hardware locality detection and control library.
#
# Acquisition information:
# Date checked: November 2011
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: http://www.open-mpi.org/projects/hwloc/
# Version: 1.3
#
# Locate hwloc through the standard TriBITS TPL mechanism: requires the
# hwloc.h header and the hwloc library.
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
REQUIRED_HEADERS hwloc.h
REQUIRED_LIBS_NAMES "hwloc"
)

View File

@ -0,0 +1,82 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
# Decide whether to use CMake's own Threads detection (USE_THREADS=TRUE)
# or fall back to the generic TriBITS TPL finder.
SET(USE_THREADS FALSE)
# Only probe automatically when the user has not preset any of the
# TPL_Pthread_* cache variables.
IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
# Use CMake's Thread finder since it is a bit smarter in determining
# whether pthreads is already built into the compiler and doesn't need
# a library to link.
FIND_PACKAGE(Threads)
# If Threads found a copy of pthreads, make sure it is one of the cases
# the TriBITS TPL system cannot handle: pthreads built into the compiler
# (empty link flags) or enabled via the bare "-pthread" flag.
IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
SET(USE_THREADS TRUE)
ENDIF()
ENDIF()
ENDIF()
IF(USE_THREADS)
# Hand CMake's result (possibly empty) straight to the TPL variables.
SET(TPL_Pthread_INCLUDE_DIRS "")
SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
SET(TPL_Pthread_LIBRARY_DIRS "")
ELSE()
# Generic TriBITS search for pthread.h and libpthread.
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
REQUIRED_HEADERS pthread.h
REQUIRED_LIBS_NAMES pthread
)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Qthreads lightweight user-level threading library.
#
# Acquisition information:
# Date checked: July 2014
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: https://code.google.com/p/qthreads
#
# Locate Qthreads through the standard TriBITS TPL mechanism: requires
# the qthread.h header and the qthread library.
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -23,36 +23,72 @@ default_arch="sm_35"
#
# The default C++ compiler.
#
default_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
#default_compiler="icpc"
#default_compiler="/usr/local/gcc/4.8.3/bin/g++"
#default_compiler="/usr/local/gcc/4.9.1/bin/g++"
host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
#host_compiler="icpc"
#host_compiler="/usr/local/gcc/4.8.3/bin/g++"
#host_compiler="/usr/local/gcc/4.9.1/bin/g++"
#
# Internal variables
#
# C++ files
cpp_files=""
# Host compiler arguments
xcompiler_args=""
cuda_arg=""
# Cuda (NVCC) only arguments
cuda_args=""
# Arguments for both NVCC and Host compiler
shared_args=""
# Linker arguments
xlinker_args=""
# Object files passable to NVCC
object_files=""
# Link objects for the host linker only
object_files_xlinker=""
first_host_option=1
# Does the user set the architecture?
arch_set=0
# Does the user overwrite the host compiler
ccbin_set=0
nvcc_error_code=0
#Error code of compilation
error_code=0
# Do a dry run without actually compiling
dry_run=0
# Skip NVCC compilation and use host compiler directly
host_only=0
# Enable workaround for CUDA 6.5 for pragma ident
replace_pragma_ident=0
# Mark first host compiler argument
first_xcompiler_arg=1
temp_dir=${TMPDIR:-/tmp}
#echo "Arguments: $# $@"
while [ $# -gt 0 ]
do
case $1 in
#show the executed command
--show)
--show|--nvcc-wrapper-show)
dry_run=1
;;
#run host compilation only
--host-only)
host_only=1
;;
#replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and the non-standardized behavior of pragma with macros
--replace-pragma-ident)
replace_pragma_ident=1
@ -61,22 +97,31 @@ do
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1"
;;
#Handle shared args (valid for both nvcc and the host compiler)
-O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
shared_args="$shared_args $1"
;;
#Handle shared args that have an argument
-o)
shared_args="$shared_args $1 $2"
shift
;;
#Handle known nvcc args
-O*|-D*|-gencode*|-c|-I*|-L*|-l*|-g|--help|--version|--dryrun|--verbose|--keep-dir|-E|-M|-G|--relocatable-device-code*|-shared|-lineinfo|-expt-extended-lambda|--resource-usage)
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage)
cuda_args="$cuda_args $1"
;;
#Handle known nvcc args that have an argument
-rdc|-maxrregcount|--default-stream)
cuda_args="$cuda_args $1 $2"
shift
;;
#Handle c++11 setting
--std=c++11|-std=c++11)
cuda_args="$cuda_args $1"
shared_args="$shared_args $1"
;;
#strip off -std=c++98 due to nvcc warnings; Tribits will place both -std=c++11 and -std=c++98
-std=c++98|--std=c++98)
;;
#Handle known nvcc args that have an argument
-o|-rdc|-maxrregcount|--default-stream)
cuda_args="$cuda_args $1 $2"
shift
;;
#strip off -pedantic because it produces endless warnings about #LINE added by the preprocessor
-pedantic|-Wpedantic|-ansi)
;;
@ -86,7 +131,12 @@ do
#strip off "-x cu" because we add that ourselves
-x)
if [[ $2 != "cu" ]]; then
xcompiler_args="$xcompiler_args,-x,$2"
if [ $first_xcompiler_arg -eq 1 ]; then
xcompiler_args="-x,$2"
first_xcompiler_arg=0
else
xcompiler_args="$xcompiler_args,-x,$2"
fi
fi
shift
;;
@ -94,6 +144,7 @@ do
-ccbin)
cuda_args="$cuda_args $1 $2"
ccbin_set=1
host_compiler=$2
shift
;;
#Handle the -arch argument (if it's not set, use a default)
@ -109,24 +160,25 @@ do
#Handle args that should be sent to the linker
-Wl*)
xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
host_linker_args="$host_linker_args ${1:4:${#1}}"
;;
#Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
*.a|*.so|*.o|*.obj)
object_files="$object_files $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.so.*|*.dylib)
object_files_xlinker="$object_files_xlinker -Xlinker $1"
object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#All other args are sent to the host compiler
*)
if [ $first_host_option -eq 0 ]; then
if [ $first_xcompiler_arg -eq 1 ]; then
xcompiler_args=$1
first_xcompiler_arg=0
else
xcompiler_args="$xcompiler_args,$1"
else
xcompiler_args="-Xcompiler $1"
first_host_option=0
fi
;;
esac
@ -136,7 +188,7 @@ done
#Add default host compiler if necessary
if [ $ccbin_set -ne 1 ]; then
cuda_args="$cuda_args -ccbin $default_compiler"
cuda_args="$cuda_args -ccbin $host_compiler"
fi
#Add architecture command
@ -145,7 +197,13 @@ if [ $arch_set -ne 1 ]; then
fi
#Compose compilation command
command="nvcc $cuda_args $xlinker_args $xcompiler_args"
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args"
if [ $first_xcompiler_arg -eq 0 ]; then
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
fi
#Compose host only command
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args"
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
if [ $replace_pragma_ident -eq 1 ]; then
@ -155,31 +213,45 @@ if [ $replace_pragma_ident -eq 1 ]; then
var=`grep pragma ${file} | grep ident | grep "#"`
if [ "${#var}" -gt 0 ]
then
sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > /tmp/nvcc_wrapper_tmp_$file
cpp_files2="$cpp_files2 /tmp/nvcc_wrapper_tmp_$file"
sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > $temp_dir/nvcc_wrapper_tmp_$file
cpp_files2="$cpp_files2 $temp_dir/nvcc_wrapper_tmp_$file"
else
cpp_files2="$cpp_files2 $file"
fi
done
cpp_files=$cpp_files2
echo $cpp_files
#echo $cpp_files
fi
if [ "$cpp_files" ]; then
command="$command $object_files_xlinker -x cu $cpp_files"
nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files"
else
command="$command $object_files"
nvcc_command="$nvcc_command $object_files"
fi
if [ "$cpp_files" ]; then
host_command="$host_command $object_files $cpp_files"
else
host_command="$host_command $object_files"
fi
#Print command for dryrun
if [ $dry_run -eq 1 ]; then
echo $command
if [ $host_only -eq 1 ]; then
echo $host_command
else
echo $nvcc_command
fi
exit 0
fi
#Run compilation command
$command
nvcc_error_code=$?
if [ $host_only -eq 1 ]; then
$host_command
else
$nvcc_command
fi
error_code=$?
#Report error code
exit $nvcc_error_code
exit $error_code

View File

@ -6,8 +6,6 @@
set -o pipefail
COMPILER_ROOT="/home/projects/x86-64"
GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
@ -18,24 +16,17 @@ CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limi
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CUDA_WARNING_FLAGS=""
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 gcc/4.7.2/base,hwloc/1.10.0/host/gnu/4.7.2 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 gcc/4.9.2/base,hwloc/1.10.0/host/gnu/4.9.2 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 gcc/4.9.2/base,hwloc/1.10.0/host/gnu/4.9.2 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 gcc/5.1.0/base,hwloc/1.10.0/host/gnu/5.1.0 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 intel/14.0.4/base,hwloc/1.10.0/host/gnu/4.7.2 $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 intel/15.0.2/base,hwloc/1.10.0/host/gnu/4.7.2 $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 clang/3.5.2/base $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 clang/3.6.1/base $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 cuda/6.5.14,nvcc-wrapper/gnu,gcc/4.7.2/base $CUDA_BUILD_LIST nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 cuda/7.0.18,nvcc-wrapper/gnu,gcc/4.7.2/base $CUDA_BUILD_LIST nvcc_wrapper $CUDA_WARNING_FLAGS"
)
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
export OMP_NUM_THREADS=4
export SEMS_MODULE_ROOT=/projects/modulefiles
module use /home/projects/modulefiles
module use /projects/modulefiles/rhel6-x86_64/sems/compiler
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
@ -47,6 +38,9 @@ DEBUG=False
ARGS=""
CUSTOM_BUILD_LIST=""
DRYRUN=False
BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
TEST_SCRIPT=False
while [[ $# > 0 ]]
do
@ -61,6 +55,15 @@ CUSTOM_BUILD_LIST="${key#*=}"
--debug*)
DEBUG=True
;;
--build-only*)
BUILD_ONLY=True
;;
--test-script*)
TEST_SCRIPT=True
;;
--num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;;
--dry-run*)
DRYRUN=True
;;
@ -69,7 +72,10 @@ echo "test_all_sandia <ARGS> <OPTIONS>:"
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos"
echo "--num=N: Number of jobs to run in parallel "
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
echo "--build-list=BUILD,BUILD,BUILD..."
echo " Provide a comma-separated list of builds instead of running all builds"
echo " Valid items:"
@ -77,6 +83,18 @@ echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo ""
echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers"
echo " gcc/4.7.2"
echo " gcc/4.8.4"
echo " gcc/4.9.2"
echo " gcc/5.1.0"
echo " intel/14.0.4"
echo " intel/15.0.2"
echo " clang/3.5.2"
echo " clang/3.6.1"
echo " cuda/6.5.14"
echo " cuda/7.0.28"
echo " cuda/7.5.18"
echo ""
echo "Examples:"
echo " Run all tests"
@ -93,6 +111,10 @@ echo " % test_all_sandia --debug"
echo ""
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
echo ""
echo "If you want to kill the tests, do:"
echo " hit ctrl-z"
echo " % kill -9 %1"
echo
exit 0
;;
@ -104,7 +126,6 @@ esac
shift
done
# set kokkos path
if [ -z "$KOKKOS_PATH" ]; then
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
@ -125,12 +146,26 @@ if [ -z "$ARGS" ]; then
ARGS='?'
fi
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
# Process args to figure out which compilers to test
COMPILERS_TO_TEST=""
for ARG in $ARGS; do
for COMPILER_DATA in "${COMPILERS[@]}"; do
arr=($COMPILER_DATA)
COMPILER=${arr[0]}
ARR=($COMPILER_DATA)
COMPILER=${ARR[0]}
if [[ "$COMPILER" = $ARG* ]]; then
if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
@ -145,15 +180,28 @@ done
# Functions
#
# get_compiler_name <COMPILER>
get_compiler_name() {
echo $1 | cut -d/ -f1
}
# get_compiler_version <COMPILER>
get_compiler_version() {
echo $1 | cut -d/ -f2
}
# Do not call directly
get_compiler_data() {
compiler=$1
item=$2
local compiler=$1
local item=$2
local compiler_name=$(get_compiler_name $compiler)
local compiler_vers=$(get_compiler_version $compiler)
local compiler_data
for compiler_data in "${COMPILERS[@]}" ; do
arr=($compiler_data)
local arr=($compiler_data)
if [ "$compiler" = "${arr[0]}" ]; then
echo "${arr[$item]}" | tr , ' '
echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
return 0
fi
done
@ -186,33 +234,60 @@ get_compiler_warning_flags() {
run_cmd() {
echo "RUNNING: $*"
if [ "$DRYRUN" != "True" ]; then
eval "$*"
eval "$* 2>&1"
fi
}
# report_and_log_test_results <SUCCESS> <DESC> <PHASE>
report_and_log_test_result() {
if [ "$1" = "0" ]; then
echo "PASSED $2"
TEST_RESULTS="${TEST_RESULTS}\nPASSED $2"
# Use sane var names
local success=$1; local desc=$2; local phase=$3;
if [ "$success" = "0" ]; then
echo " PASSED $desc"
touch $PASSED_DIR/$desc
else
echo "FAILED $2" >&2
TEST_RESULTS="${TEST_RESULTS}\nFAILED $2 ($3)"
NUM_FAILED+=1
echo " FAILED $desc" >&2
echo $phase > $FAILED_DIR/$desc
cat ${desc}.${phase}.log
fi
}
setup_env() {
local compiler=$1
local compiler_modules=$(get_compiler_modules $compiler)
module purge
local mod
for mod in $compiler_modules; do
module load $mod 2>&1
# It is ridiculously hard to check for the success of a loaded
# module. Module does not return error codes and piping to grep
# causes module to run in a subshell.
module list 2>&1 | grep "$mod" >& /dev/null || return 1
done
return 0
}
# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
single_build_and_test() {
# Use sane var names
local compiler=$1; local build=$2; local build_type=$3;
cd $ROOT_DIR/$compiler
# set up env
mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
cd $ROOT_DIR/$compiler/"${build}-$build_type"
local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
# Set up flags
local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
local compiler_exe=$(get_compiler_exe_name $compiler)
if [[ "$build_type" = hwloc* ]]; then
local extra_args="--with-hwloc=$HWLOC_ROOT"
local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
fi
if [[ "$build_type" = *debug* ]]; then
@ -222,36 +297,63 @@ single_build_and_test() {
local cxxflags="-O3 $compiler_warning_flags"
fi
local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
echo " Doing build: $desc"
mkdir "${build}-$build_type"
cd "${build}-$build_type"
if [[ "$compiler" == cuda* ]]; then
cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
export TMPDIR=$(pwd)
fi
# cxxflags="-DKOKKOS_USING_EXPERIMENTAL_VIEW $cxxflags"
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" \"$extra_args\" 2>&1 | tee ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
run_cmd make build-test 2>&1 | tee ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
run_cmd make test 2>&1 | tee ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
echo " Starting job $desc"
if [ "$TEST_SCRIPT" = "True" ]; then
local rand=$[ 1 + $[ RANDOM % 10 ]]
sleep $rand
if [ $rand -gt 5 ]; then
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi
else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
if [[ "$BUILD_ONLY" == False ]]; then
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
fi
fi
report_and_log_test_result 0 $desc
return 0
}
setup_env() {
local compiler=$1
local compiler_modules=$(get_compiler_modules $compiler)
module purge
for mod in $compiler_modules; do
module load $mod
# It is ridiculously hard to check for the success of a loaded
# module. Module does not return error codes and piping to grep
# causes module to run in a subshell.
module list 2>&1 | grep "$mod"
# wait_for_jobs <NUM-JOBS>
wait_for_jobs() {
local -i max_jobs=$1
local -i num_active_jobs=$(jobs | wc -l)
while [ $num_active_jobs -ge $max_jobs ]
do
sleep 1
num_active_jobs=$(jobs | wc -l)
jobs >& /dev/null
done
}
# run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
run_in_background() {
local compiler=$1
local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
if [[ "$BUILD_ONLY" == True ]]; then
num_jobs=8
else
if [[ "$compiler" == cuda* ]]; then
num_jobs=1
fi
fi
wait_for_jobs $num_jobs
single_build_and_test $* &
}
# build_and_test_all <COMPILER>
build_and_test_all() {
# Get compiler data
@ -262,44 +364,74 @@ build_and_test_all() {
local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
fi
# set up env
cd $ROOT_DIR
mkdir -p $compiler
setup_env $compiler
# do builds
local build
for build in $compiler_build_list
do
single_build_and_test $compiler $build $BUILD_TYPE
run_in_background $compiler $build $BUILD_TYPE
# If not cuda, do a hwloc test too
if [[ "$compiler" != cuda* ]]; then
single_build_and_test $compiler $build "hwloc-$BUILD_TYPE"
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
fi
done
return 0
}
get_test_root_dir() {
local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}
if [ $num_to_delete -gt 0 ]; then
/bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
fi
echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
}
wait_summarize_and_exit() {
wait_for_jobs 1
echo "#######################################################"
echo "PASSED TESTS"
echo "#######################################################"
\ls -1 $PASSED_DIR | sort
echo "#######################################################"
echo "FAILED TESTS"
echo "#######################################################"
local failed_test
local -i rv=0
for failed_test in $(\ls -1 $FAILED_DIR)
do
echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
rv=$rv+1
done
exit $rv
}
#
# Main
#
/bin/rm -rf TestAll
mkdir TestAll
cd TestAll
ROOT_DIR=$(get_test_root_dir)
mkdir -p $ROOT_DIR
cd $ROOT_DIR
TEST_RESULTS=""
declare -i NUM_FAILED=0
ROOT_DIR=$(pwd)
PASSED_DIR=$ROOT_DIR/results/passed
FAILED_DIR=$ROOT_DIR/results/failed
mkdir -p $PASSED_DIR
mkdir -p $FAILED_DIR
echo "Going to test compilers: " $COMPILERS_TO_TEST
for COMPILER in $COMPILERS_TO_TEST; do
echo "Testing compiler $COMPILER"
build_and_test_all $COMPILER
done
echo "#######################################################"
echo "RESULT SUMMARY"
echo "#######################################################"
echo -e $TEST_RESULTS
exit $NUM_FAILED
wait_summarize_and_exit

View File

@ -0,0 +1,287 @@
#! /usr/bin/env python
"""
Compute the size at which the current compiler will start to
significantly scale back optimization.
The CPP file being modified will need the following tags.
// JGF_DUPLICATE_BEGIN - Put before start of function to duplicate
// JGF_DUPLICATE_END - Put after end of function to duplicate
// JGF_DUPE function_name(args); - Put anywhere where it's legal to
put a function call but not in your timing section.
The program will need to output the string:
FOM: <number>
This will represent the program's performance
"""
import argparse, sys, os, doctest, subprocess, re, time
VERBOSE = False
###############################################################################
def parse_command_line(args, description):
###############################################################################
    """Parse the command line.

    args        -- full argv list; args[0] is the program name
    description -- text for the --help output

    Returns (cppfile, buildcmd, execmd, start, end, repeat, template, csv).
    Side effect: sets the module-global VERBOSE flag when --verbose is given.
    """
    parser = argparse.ArgumentParser(
        usage="""\n%s <cppfile> <build-command> <run-command> [--verbose]
OR
%s --help
OR
%s --test
\033[1mEXAMPLES:\033[0m
> %s foo.cpp 'make -j4' foo
""" % ((os.path.basename(args[0]), ) * 4),
        description=description,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("cppfile", help="Name of file to modify.")
    parser.add_argument("buildcmd", help="Build command")
    parser.add_argument("execmd", help="Run command")
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print extra information")
    parser.add_argument("-s", "--start", type=int, default=1,
                        help="Starting number of dupes")
    parser.add_argument("-e", "--end", type=int, default=1000,
                        help="Ending number of dupes")
    parser.add_argument("-n", "--repeat", type=int, default=10,
                        # fixed typo: "individial" -> "individual"
                        help="Number of times to repeat an individual execution. Best value will be taken.")
    parser.add_argument("-t", "--template", action="store_true",
                        help="Use templating instead of source copying to increase object size")
    parser.add_argument("-c", "--csv", action="store_true",
                        help="Print results as CSV")

    parsed = parser.parse_args(args[1:])

    if parsed.verbose:
        global VERBOSE
        VERBOSE = True

    return (parsed.cppfile, parsed.buildcmd, parsed.execmd, parsed.start,
            parsed.end, parsed.repeat, parsed.template, parsed.csv)
###############################################################################
def verbose_print(msg, override=None):
###############################################################################
    """Print msg when verbose output is enabled.

    override=True forces printing, override=False suppresses it; the default
    (None) defers to the module-global VERBOSE flag.
    """
    # print(msg) is equivalent to the old 'print msg' statement for a single
    # argument and works under both Python 2 and 3.
    if (VERBOSE and override is not False) or override:
        print(msg)
###############################################################################
def error_print(msg):
###############################################################################
    """Print msg to stderr.

    Replaces the Python-2-only 'print >> sys.stderr, msg' statement (a syntax
    error under Python 3) with an equivalent version-neutral write.
    """
    sys.stderr.write("%s\n" % msg)
###############################################################################
def expect(condition, error_msg):
###############################################################################
    """
    Similar to assert except doesn't generate an ugly stacktrace. Useful for
    checking user error, not programming error.
    """
    # Guard-clause form: bail out early on success, fail loudly otherwise.
    if condition:
        return
    raise SystemExit("FAIL: %s" % error_msg)
###############################################################################
def run_cmd(cmd, ok_to_fail=False, input_str=None, from_dir=None, verbose=None,
            arg_stdout=subprocess.PIPE, arg_stderr=subprocess.PIPE):
###############################################################################
    """Run a shell command and return its output.

    When ok_to_fail is True, returns (status, stdout, stderr) without checking
    the exit code; otherwise aborts via expect() on failure and returns stdout.
    stdout/stderr destinations may be overridden with arg_stdout/arg_stderr
    (e.g. a file object, or None for the terminal).
    """
    verbose_print("RUN: %s" % cmd, verbose)

    # Only open a stdin pipe when there is something to feed the child.
    child_stdin = subprocess.PIPE if input_str is not None else None

    proc = subprocess.Popen(cmd,
                            shell=True,
                            stdout=arg_stdout,
                            stderr=arg_stderr,
                            stdin=child_stdin,
                            cwd=from_dir)
    output, errput = proc.communicate(input_str)
    if output is not None:
        output = output.strip()
    stat = proc.wait()

    if ok_to_fail:
        return stat, output, errput

    if arg_stderr is not None:
        if errput is None:
            # stderr was redirected to a file object; read it back for the message
            errput = open(arg_stderr.name, "r").read()
        expect(stat == 0, "Command: '%s' failed with error '%s'" % (cmd, errput))
    else:
        expect(stat == 0, "Command: '%s' failed. See terminal output" % cmd)
    return output
###############################################################################
def build_and_run(source, cppfile, buildcmd, execmd, repeat):
###############################################################################
    """Write `source` to cppfile, build, then run `repeat` times.

    The program under test must print a line of the form 'FOM: <number>';
    returns the best (largest) figure of merit seen across all repeats.
    Aborts via expect() if no FOM line is found in a run's output.
    """
    # Use a with-block so the file is flushed and closed before the build
    # starts (the original leaked the handle, risking an un-flushed file).
    with open(cppfile, 'w') as fd:
        fd.writelines(source)

    run_cmd(buildcmd)

    # Compile the regex once; it is loop-invariant.
    fom_regex = re.compile(r'^FOM: ([0-9.]+)$')

    best = None
    for _ in range(repeat):  # range works on both Python 2 and 3
        wait_for_quiet_machine()
        output = run_cmd(execmd)
        current = None
        for line in output.splitlines():
            m = fom_regex.match(line)
            if m is not None:
                current = float(m.groups()[0])
                break
        expect(current is not None, "No lines in output matched FOM regex")

        # Higher FOM is better; keep the maximum over all repeats.
        if best is None or best < current:
            best = current

    return best
###############################################################################
def wait_for_quiet_machine():
###############################################################################
    """Block until the machine looks idle enough (>= 95% idle CPU) for timing."""
    idle_re = re.compile(r'^([0-9.]+)%id$')
    while True:
        time.sleep(2)
        # The first iteration of top gives garbage results
        raw = run_cmd("top -bn2 | grep 'Cpu(s)' | tr ',' ' ' | tail -n 1 | awk '{print $5}'")
        match = idle_re.match(raw)
        expect(match is not None, "top not returning output in expected form")
        if float(match.groups()[0]) >= 95:
            break
        error_print("Machine is too busy, waiting for it to become free")
###############################################################################
def add_n_dupes(curr_lines, num_dupes, template):
###############################################################################
    """Insert num_dupes duplicates of the tagged function into curr_lines.

    curr_lines -- list of source lines, modified in place
    num_dupes  -- number of duplicates to create
    template   -- if True, duplicate via template instantiations (foo<N>())
                  instead of copying the function body (fooN())

    The source must contain the JGF_DUPLICATE_BEGIN / JGF_DUPLICATE_END tags
    around the function to duplicate and a '// JGF_DUPE: call(...);' marker
    where the duplicate invocations may legally be inserted.
    """
    function_name = None
    function_invocation = None
    function_lines = []
    function_re = re.compile(r'^.* (\w+) *[(]')
    function_inv_re = re.compile(r'^.*JGF_DUPE: +(.+)$')

    # Scan for the tagged function definition and the invocation marker.
    record = False
    definition_insertion_point = None
    invocation_insertion_point = None
    for idx, line in enumerate(curr_lines):
        if "JGF_DUPLICATE_BEGIN" in line:
            record = True
            # The function signature is expected on the line after the tag.
            m = function_re.match(curr_lines[idx+1])
            expect(m is not None, "Could not find function in line '%s'" % curr_lines[idx+1])
            function_name = m.groups()[0]
        elif "JGF_DUPLICATE_END" in line:
            record = False
            definition_insertion_point = idx + 1
        elif record:
            function_lines.append(line)
        elif "JGF_DUPE" in line:
            m = function_inv_re.match(line)
            expect(m is not None, "Could not find function invocation example in line '%s'" % line)
            function_invocation = m.groups()[0]
            invocation_insertion_point = idx + 1

    expect(function_name is not None, "Could not find name of dupe function")
    expect(function_invocation is not None, "Could not find function invocation point")
    # Definitions are inserted before invocations; insert invocations first so
    # the definition index stays valid.
    expect(definition_insertion_point < invocation_insertion_point, "fix me")

    dupe_func_defs = []
    # Random dispatch keeps the compiler from proving the calls dead.
    dupe_invocations = ["int jgf_rand = std::rand();\n", "if (false) {}\n"]
    for i in range(num_dupes):  # range works on both Python 2 and 3
        if not template:
            # Copy the whole function body under a new, numbered name.
            dupe_func = list(function_lines)
            dupe_func[0] = dupe_func[0].replace(function_name, "%s%d" % (function_name, i))
            dupe_func_defs.extend(dupe_func)

        dupe_invocations.append("else if (jgf_rand == %d) " % i)
        if template:
            dupe_call = function_invocation.replace(function_name, "%s<%d>" % (function_name, i)) + "\n"
        else:
            dupe_call = function_invocation.replace(function_name, "%s%d" % (function_name, i)) + "\n"
        dupe_invocations.append(dupe_call)

    curr_lines[invocation_insertion_point:invocation_insertion_point] = dupe_invocations
    curr_lines[definition_insertion_point:definition_insertion_point] = dupe_func_defs
###############################################################################
def report(num_dupes, curr_lines, object_file, orig_fom, curr_fom, csv=False, is_first_report=False):
###############################################################################
    """Report size/performance results for one duplication step.

    Prints either a CSV row (with a header on the first report) or a
    human-readable summary. Python-2-only multi-argument print statements are
    rewritten as single-argument print() calls producing identical output on
    both Python 2 and 3.
    """
    fom_change = (curr_fom - orig_fom) / orig_fom
    obj_size = os.path.getsize(object_file)

    if csv:
        if is_first_report:
            print("num_dupes, obj_byte_size, loc, fom, pct_diff")
        print("%s, %s, %s, %s, %s" % (num_dupes, obj_size, len(curr_lines), curr_fom, fom_change*100))
    else:
        print("========================================================")
        print("For number of dupes: %s" % num_dupes)
        print("Object file size (bytes): %s" % obj_size)
        print("Lines of code: %s" % len(curr_lines))
        print("Field of merit: %s" % curr_fom)
        print("Change pct: %s" % (fom_change*100))
###############################################################################
def obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv=False):
###############################################################################
    """Measure how the FOM degrades as the object file grows.

    Doubles the duplicate count from `start` until it reaches `end`,
    rebuilding and re-running each time, and reports results per step.
    The user's original cppfile is backed up to <cppfile>.orig and restored
    when done.
    """
    # with-block closes the handle (the original leaked it).
    with open(cppfile, 'r') as fd:
        orig_source_lines = fd.readlines()
    backup_file = "%s.orig" % cppfile
    object_file = "%s.o" % os.path.splitext(cppfile)[0]

    os.rename(cppfile, backup_file)
    try:
        orig_fom = build_and_run(orig_source_lines, cppfile, buildcmd, execmd, repeat)
        report(0, orig_source_lines, object_file, orig_fom, orig_fom, csv=csv, is_first_report=True)

        i = start
        while i < end:
            curr_lines = list(orig_source_lines)
            add_n_dupes(curr_lines, i, template)
            curr_fom = build_and_run(curr_lines, cppfile, buildcmd, execmd, repeat)
            report(i, curr_lines, object_file, orig_fom, curr_fom, csv=csv)
            i *= 2  # make growth function configurable?
    finally:
        # Always restore the user's original file, even if a build/run fails
        # (the original left the source tree mangled on any exception).
        if os.path.exists(cppfile):
            os.remove(cppfile)
        os.rename(backup_file, cppfile)
###############################################################################
def _main_func(description):
###############################################################################
    """Entry point: run doctest self-tests under --test, otherwise the check."""
    if "--test" in sys.argv:
        test_results = doctest.testmod(verbose=True)
        sys.exit(1 if test_results.failed > 0 else 0)

    cppfile, buildcmd, execmd, start, end, repeat, template, csv = \
        parse_command_line(sys.argv, description)

    obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv)

###############################################################################

if __name__ == "__main__":
    _main_func(__doc__)

View File

@ -0,0 +1,10 @@
# Declare the Containers subpackage of the Kokkos TriBITS package.
TRIBITS_SUBPACKAGE(Containers)
# Library sources live in src/.
ADD_SUBDIRECTORY(src)
# Test directories are only processed when testing is enabled for this subpackage.
TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
# Finalize the subpackage (must come last).
TRIBITS_SUBPACKAGE_POSTPROCESS()

View File

@ -0,0 +1,5 @@
# Dependency declaration for the KokkosContainers subpackage:
# requires the core Kokkos library; optionally uses the Pthread/CUDA/HWLOC
# TPLs for the library, and CUSPARSE for tests only.
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
  LIB_REQUIRED_PACKAGES KokkosCore
  LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
  TEST_OPTIONAL_TPLS CUSPARSE
  )

View File

@ -0,0 +1,4 @@
/* Configuration header for Kokkos containers.
 * Currently the include guard is the only content; presumably this file is
 * the template consumed by TRIBITS_CONFIGURE_FILE and may gain #cmakedefine
 * entries later — TODO confirm.
 */
#ifndef KOKKOS_CONTAINERS_CONFIG_H
#define KOKKOS_CONTAINERS_CONFIG_H
#endif

View File

@ -0,0 +1,26 @@
# Build the containers performance test executable.
# NOTE(review): directory-scoped INCLUDE_DIRECTORIES is legacy style kept for
# TriBITS consistency; modern CMake would use target_include_directories.
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )

# Always-built sources; TestCuda.cpp is expected to compile to a no-op when
# CUDA is disabled — TODO confirm.
SET(SOURCES
  TestMain.cpp
  TestCuda.cpp
  )

# Backend-specific tests are added only when the corresponding Kokkos
# device option is enabled.
IF(Kokkos_ENABLE_Pthread)
  LIST( APPEND SOURCES TestThreads.cpp)
ENDIF()

IF(Kokkos_ENABLE_OpenMP)
  LIST( APPEND SOURCES TestOpenMP.cpp)
ENDIF()

# Register the executable and its CTest entry; a test run printing " FAILED "
# is treated as a failure via FAIL_REGULAR_EXPRESSION.
TRIBITS_ADD_EXECUTABLE_AND_TEST(
  PerformanceTest
  SOURCES ${SOURCES}
  COMM serial mpi
  NUM_MPI_PROCS 1
  FAIL_REGULAR_EXPRESSION " FAILED "
  TESTONLYLIBS kokkos_gtest
  )

View File

@ -6,12 +6,12 @@ vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests
default: build_all
echo "End Build"
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
@ -50,7 +50,7 @@ KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads
KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
@ -63,11 +63,11 @@ test-threads: KokkosContainers_PerformanceTest_Threads
test-openmp: KokkosContainers_PerformanceTest_OpenMP
./KokkosContainers_PerformanceTest_OpenMP
build_all: $(TARGETS)
test: $(TEST_TARGETS)
clean: kokkos-clean
rm -f *.o $(TARGETS)

View File

@ -0,0 +1,31 @@
# Generate the package configuration header into the binary dir.
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)

INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

#-----------------------------------------------------------------------------

SET(HEADERS "")
SET(SOURCES "")
SET(HEADERS_IMPL "")

# NOTE(review): FILE(GLOB) misses newly added files until CMake re-runs;
# an explicit source list would be more robust, but globbing matches the
# established convention in this package.
FILE(GLOB HEADERS *.hpp)
FILE(GLOB HEADERS_IMPL impl/*.hpp)
FILE(GLOB SOURCES impl/*.cpp)

# Install the impl/ headers alongside the public ones, preserving layout.
SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})

INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/)

TRIBITS_ADD_LIBRARY(
  kokkoscontainers
  HEADERS ${HEADERS}
  NOINSTALLHEADERS ${HEADERS_IMPL}
  SOURCES ${SOURCES}
  DEPLIBS
  )

#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------

View File

@ -90,7 +90,7 @@ public:
private:
enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
enum { block_mask = block_size-1u };
enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
public:
@ -322,7 +322,7 @@ public:
private:
enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
enum { block_mask = block_size -1u };
enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
public:
ConstBitset()

View File

@ -106,9 +106,9 @@ public:
//! The type of a Kokkos::View on the device.
typedef View< typename traits::data_type ,
typename traits::array_layout ,
typename traits::device_type ,
typename traits::memory_traits > t_dev ;
Arg1Type ,
Arg2Type ,
Arg3Type > t_dev ;
/// \typedef t_host
/// \brief The type of a Kokkos::View host mirror of \c t_dev.
@ -117,9 +117,9 @@ public:
//! The type of a const View on the device.
//! The type of a Kokkos::View on the device.
typedef View< typename traits::const_data_type ,
typename traits::array_layout ,
typename traits::device_type ,
typename traits::memory_traits > t_dev_const ;
Arg1Type ,
Arg2Type ,
Arg3Type > t_dev_const ;
/// \typedef t_host_const
/// \brief The type of a const View host mirror of \c t_dev_const.
@ -221,6 +221,19 @@ public:
modified_host (src.modified_host)
{}
//! Subview constructor
template< class SD, class S1 , class S2 , class S3
, class Arg0 , class ... Args >
DualView( const DualView<SD,S1,S2,S3> & src
, const Arg0 & arg0
, Args ... args
)
: d_view( Kokkos::subview( src.d_view , arg0 , args ... ) )
, h_view( Kokkos::subview( src.h_view , arg0 , args ... ) )
, modified_device (src.modified_device)
, modified_host (src.modified_host)
{}
/// \brief Create DualView from existing device and host View objects.
///
/// This constructor assumes that the device and host View objects
@ -237,7 +250,30 @@ public:
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
{
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ());
#else
if ( d_view.rank != h_view.rank ||
d_view.dimension_0() != h_view.dimension_0() ||
d_view.dimension_1() != h_view.dimension_1() ||
d_view.dimension_2() != h_view.dimension_2() ||
d_view.dimension_3() != h_view.dimension_3() ||
d_view.dimension_4() != h_view.dimension_4() ||
d_view.dimension_5() != h_view.dimension_5() ||
d_view.dimension_6() != h_view.dimension_6() ||
d_view.dimension_7() != h_view.dimension_7() ||
d_view.stride_0() != h_view.stride_0() ||
d_view.stride_1() != h_view.stride_1() ||
d_view.stride_2() != h_view.stride_2() ||
d_view.stride_3() != h_view.stride_3() ||
d_view.stride_4() != h_view.stride_4() ||
d_view.stride_5() != h_view.stride_5() ||
d_view.stride_6() != h_view.stride_6() ||
d_view.stride_7() != h_view.stride_7() ||
d_view.span() != h_view.span() ) {
Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
}
#endif
}
//@}
@ -501,6 +537,52 @@ public:
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//
// Partial specializations of Kokkos::subview() for DualView objects.
//
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
namespace Kokkos {
namespace Impl {
template< class D, class A1, class A2, class A3, class ... Args >
struct DualViewSubview {
typedef typename Kokkos::Experimental::Impl::ViewMapping
< void
, Kokkos::ViewTraits< D, A1, A2, A3 >
, Args ...
>::traits_type dst_traits ;
typedef Kokkos::DualView
< typename dst_traits::data_type
, typename dst_traits::array_layout
, typename dst_traits::device_type
, typename dst_traits::memory_traits
> type ;
};
} /* namespace Impl */
template< class D , class A1 , class A2 , class A3 , class ... Args >
typename Impl::DualViewSubview<D,A1,A2,A3,Args...>::type
subview( const DualView<D,A1,A2,A3> & src , Args ... args )
{
return typename
Impl::DualViewSubview<D,A1,A2,A3,Args...>::type( src , args ... );
}
} /* namespace Kokkos */
#else
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//
// Partial specializations of Kokkos::subview() for DualView objects.
//
@ -839,6 +921,15 @@ subview( const DualView<D,A1,A2,A3> & src ,
return sub_view;
}
} // namespace Kokkos
#endif /* defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//
// Partial specialization of Kokkos::deep_copy() for DualView objects.
//

View File

@ -53,12 +53,8 @@
*/
namespace Kokkos {
template <typename Scalar, class Space = Kokkos::DefaultExecutionSpace >
class vector : public DualView<Scalar*,LayoutLeft,Space> {
public:
typedef typename Space::memory_space memory_space;
typedef typename Space::execution_space execution_space;
typedef typename Kokkos::Device<execution_space,memory_space> device_type;
template< class Scalar, class Arg1Type = void>
class vector : public DualView<Scalar*,LayoutLeft,Arg1Type> {
typedef Scalar value_type;
typedef Scalar* pointer;
@ -72,7 +68,7 @@ private:
size_t _size;
typedef size_t size_type;
float _extra_storage;
typedef DualView<Scalar*,LayoutLeft,Space> DV;
typedef DualView<Scalar*,LayoutLeft,Arg1Type> DV;
public:
@ -93,7 +89,7 @@ public:
};
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Space>("Vector",size_t(n*(1.1))) {
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
_size = n;
_extra_storage = 1.1;
DV::modified_host() = 1;

View File

@ -0,0 +1,40 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
SET(SOURCES
UnitTestMain.cpp
TestCuda.cpp
)
SET(LIBRARIES kokkoscore)
IF(Kokkos_ENABLE_Pthread)
LIST( APPEND SOURCES
TestThreads.cpp
)
ENDIF()
IF(Kokkos_ENABLE_Serial)
LIST( APPEND SOURCES
TestSerial.cpp
)
ENDIF()
IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES
TestOpenMP.cpp
)
ENDIF()
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest
SOURCES ${SOURCES}
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)

View File

@ -6,12 +6,12 @@ vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
default: build_all
echo "End Build"
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
@ -56,7 +56,7 @@ KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Threads
KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_OpenMP
@ -74,11 +74,11 @@ test-openmp: KokkosContainers_UnitTest_OpenMP
test-serial: KokkosContainers_UnitTest_Serial
./KokkosContainers_UnitTest_Serial
build_all: $(TARGETS)
test: $(TEST_TARGETS)
clean: kokkos-clean
rm -f *.o $(TARGETS)

View File

@ -1,12 +1,12 @@
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -35,7 +35,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
@ -43,7 +43,7 @@
#ifndef KOKKOS_TEST_COMPLEX_HPP
#define KOKKOS_TEST_COMPLEX_HPP
//#include <Kokkos_Complex.hpp>
#include <Kokkos_Complex.hpp>
#include <gtest/gtest.h>
#include <iostream>
@ -124,14 +124,13 @@ namespace Impl {
complex_type z1 (1.0, -1.0);
complex_type z2 (-1.0, 1.0);
complex_type z3 = z1 - z2;
ASSERT_TRUE( z3 == complex_type (2.0, -2.0) );
complex_type z3 = z1 * z2;
ASSERT_TRUE( z3 == complex_type (0.0, 2.0) );
// Test unary minus.
complex_type z4 (3.0, -4.0);
ASSERT_TRUE( z4 == complex_type (3.0, -4.0) );
ASSERT_TRUE( -z4 == complex_type (-3.0, 4.0) );
ASSERT_TRUE( z4 == -complex_type (-3.0, 4.0) );
// Make sure that std::complex * Kokkos::complex works too.
std::complex<RealType> z4 (-1.0, 1.0);
complex_type z5 = z4 * z1;
ASSERT_TRUE( z5 == complex_type (0.0, 2.0) );
}
template <typename RealType>
@ -208,7 +207,7 @@ namespace Impl {
typedef Kokkos::View<const Kokkos::complex<RealType>*, Device> view_type;
typedef typename view_type::size_type size_type;
typedef Kokkos::complex<RealType> value_type;
typedef Kokkos::complex<RealType> value_type;
KOKKOS_INLINE_FUNCTION
void operator () (const size_type i, Kokkos::complex<RealType>& sum) const {

View File

@ -0,0 +1,11 @@
TRIBITS_SUBPACKAGE(Core)
ADD_SUBDIRECTORY(src)
TRIBITS_ADD_TEST_DIRECTORIES(unit_test)
TRIBITS_ADD_TEST_DIRECTORIES(perf_test)
TRIBITS_SUBPACKAGE_POSTPROCESS()

View File

@ -0,0 +1,4 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -0,0 +1,50 @@
#ifndef KOKKOS_CORE_CONFIG_H
#define KOKKOS_CORE_CONFIG_H
/* The trivial 'src/build_common.sh' creates a config
* that must stay in sync with this file.
*/
#cmakedefine KOKKOS_FOR_SIERRA
#if !defined( KOKKOS_FOR_SIERRA )
#cmakedefine KOKKOS_HAVE_MPI
#cmakedefine KOKKOS_HAVE_CUDA
// mfh 16 Sep 2014: If passed in on the command line, that overrides
// any value of KOKKOS_USE_CUDA_UVM here. Doing this should prevent build
// warnings like this one:
//
// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: "KOKKOS_USE_CUDA_UVM" redefined
//
// At some point, we should edit the test-build scripts in
// Trilinos/cmake/ctest/drivers/perseus/, and take
// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there. I
// hesitate to do that now, because I'm not sure if all the files are
// including KokkosCore_config.h (or a header file that includes it) like
// they should.
#if ! defined(KOKKOS_USE_CUDA_UVM)
#cmakedefine KOKKOS_USE_CUDA_UVM
#endif // ! defined(KOKKOS_USE_CUDA_UVM)
#cmakedefine KOKKOS_HAVE_PTHREAD
#cmakedefine KOKKOS_HAVE_SERIAL
#cmakedefine KOKKOS_HAVE_QTHREAD
#cmakedefine KOKKOS_HAVE_Winthread
#cmakedefine KOKKOS_HAVE_OPENMP
#cmakedefine KOKKOS_HAVE_HWLOC
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
#cmakedefine KOKKOS_HAVE_CXX11
#cmakedefine KOKKOS_HAVE_CUSPARSE
#cmakedefine KOKKOS_ENABLE_PROFILING_COLLECT_KERNEL_DATA
#cmakedefine KOKKOS_ENABLE_PROFILING_AGGREGATE_MPI
// Don't forbid users from defining this macro on the command line,
// but still make sure that CMake logic can control its definition.
#if ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
#cmakedefine KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
#endif // KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
#endif // KOKKOS_FOR_SIERRA
#endif // KOKKOS_CORE_CONFIG_H

View File

@ -0,0 +1,18 @@
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINRARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
SET(SOURCES
PerfTestMain.cpp
PerfTestHost.cpp
PerfTestCuda.cpp
)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
PerfTest
SOURCES ${SOURCES}
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)

View File

@ -1,17 +1,17 @@
KOKKOS_PATH = ../..
GTEST_PATH = ../../TPL/gtest
GTEST_PATH = ../../tpls/gtest
vpath %.cpp ${KOKKOS_PATH}/core/perf_test
default: build_all
echo "End Build"
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = $(CXX)
LDFLAGS ?= -lpthread
@ -47,12 +47,12 @@ test-performance: KokkosCore_PerformanceTest
test-atomic: KokkosCore_PerformanceTest_Atomics
./KokkosCore_PerformanceTest_Atomics
build_all: $(TARGETS)
test: $(TEST_TARGETS)
clean: kokkos-clean
rm -f *.o $(TARGETS)

View File

@ -174,7 +174,7 @@ struct TextureFetch
TEST_F( cuda, texture_double )
{
printf("Random reduce of double through texture fetch\n");
for (int i=1; i<=27; ++i) {
for (int i=1; i<=26; ++i) {
int size = 1<<i;
double time = 0;
double reduce = 0;

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -167,7 +167,7 @@ T AddLoopSerial(int loop) {
*data+=(T)1;
T val = *data;
delete data;
delete [] data;
return val;
}
@ -272,7 +272,7 @@ T CASLoopSerial(int loop) {
}
T val = *data;
delete data;
delete [] data;
return val;
}
@ -373,8 +373,8 @@ T ExchLoopSerial(int loop) {
}
T val = *data2 + *data;
delete data;
delete data2;
delete [] data;
delete [] data2;
return val;
}

View File

@ -0,0 +1,113 @@
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_Serial
KOKKOS_HAVE_SERIAL
"Whether to enable the Kokkos::Serial device. This device executes \"parallel\" kernels sequentially on a single CPU thread. It is enabled by default. If you disable this device, please enable at least one other CPU device, such as Kokkos::OpenMP or Kokkos::Threads."
ON
)
ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)
# Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA governs whether Kokkos allows
# use of lambdas at the outer level of parallel dispatch (that is, as
# the argument to an outer parallel_for, parallel_reduce, or
# parallel_scan). This works with non-CUDA execution spaces if C++11
# is enabled. It does not currently work with public releases of
# CUDA. If that changes, please change the default here to ON if CUDA
# and C++11 are ON.
IF (${PROJECT_NAME}_ENABLE_CXX11)
IF (${PACKAGE_NAME}_ENABLE_CUDA)
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT ON)
ENDIF ()
ELSE ()
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
ENDIF ()
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA
KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
"Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan). This requires C++11. It also does not currently work with public releases of CUDA. As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled. If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined. For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
${Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT}
)
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
#-----------------------------------------------------------------------------
SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
#-----------------------------------------------------------------------------
SET(HEADERS_PUBLIC "")
SET(HEADERS_PRIVATE "")
SET(SOURCES "")
FILE(GLOB HEADERS_PUBLIC Kokkos*.hpp)
LIST( APPEND HEADERS_PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h )
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_IMPL impl/*.hpp)
FILE(GLOB SOURCES_IMPL impl/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_IMPL} )
LIST(APPEND SOURCES ${SOURCES_IMPL} )
INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_THREADS Threads/*.hpp)
FILE(GLOB SOURCES_THREADS Threads/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_THREADS} )
LIST(APPEND SOURCES ${SOURCES_THREADS} )
INSTALL(FILES ${HEADERS_THREADS} DESTINATION ${TRILINOS_INCDIR}/Threads/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_OPENMP OpenMP/*.hpp)
FILE(GLOB SOURCES_OPENMP OpenMP/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_OPENMP} )
LIST(APPEND SOURCES ${SOURCES_OPENMP} )
INSTALL(FILES ${HEADERS_OPENMP} DESTINATION ${TRILINOS_INCDIR}/OpenMP/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_CUDA Cuda/*.hpp)
FILE(GLOB SOURCES_CUDA Cuda/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_CUDA} )
LIST(APPEND SOURCES ${SOURCES_CUDA} )
INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_QTHREAD Qthread/*.hpp)
FILE(GLOB SOURCES_QTHREAD Qthread/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREAD} )
LIST(APPEND SOURCES ${SOURCES_QTHREAD} )
INSTALL(FILES ${HEADERS_QTHREAD} DESTINATION ${TRILINOS_INCDIR}/Qthread/)
#-----------------------------------------------------------------------------
TRIBITS_ADD_LIBRARY(
kokkoscore
HEADERS ${HEADERS_PUBLIC}
NOINSTALLHEADERS ${HEADERS_PRIVATE}
SOURCES ${SOURCES}
DEPLIBS
)

View File

@ -54,7 +54,59 @@ namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
struct ViewOperatorBoundsErrorAbort< Kokkos::CudaSpace > {
KOKKOS_INLINE_FUNCTION
static void apply( const size_t rank
, const size_t n0 , const size_t n1
, const size_t n2 , const size_t n3
, const size_t n4 , const size_t n5
, const size_t n6 , const size_t n7
, const size_t i0 , const size_t i1
, const size_t i2 , const size_t i3
, const size_t i4 , const size_t i5
, const size_t i6 , const size_t i7 )
{
const int r =
( n0 <= i0 ? 0 :
( n1 <= i1 ? 1 :
( n2 <= i2 ? 2 :
( n3 <= i3 ? 3 :
( n4 <= i4 ? 4 :
( n5 <= i5 ? 5 :
( n6 <= i6 ? 6 : 7 )))))));
const size_t n =
( n0 <= i0 ? n0 :
( n1 <= i1 ? n1 :
( n2 <= i2 ? n2 :
( n3 <= i3 ? n3 :
( n4 <= i4 ? n4 :
( n5 <= i5 ? n5 :
( n6 <= i6 ? n6 : n7 )))))));
const size_t i =
( n0 <= i0 ? i0 :
( n1 <= i1 ? i1 :
( n2 <= i2 ? i2 :
( n3 <= i3 ? i3 :
( n4 <= i4 ? i4 :
( n5 <= i5 ? i5 :
( n6 <= i6 ? i6 : i7 )))))));
printf("Cuda view array bounds error index %d : FAILED %lu < %lu\n" , r , i , n );
Kokkos::Impl::cuda_abort("Cuda view array bounds error");
}
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_case this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
@ -130,7 +182,6 @@ struct CudaTextureFetch {
CudaTextureFetch( const ValueType * const arg_ptr
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
)
// 'attach_texture_object' returns 0 when __CUDA_ARCH__ < 300
: m_obj( record.template attach_texture_object< AliasType >() )
, m_ptr( arg_ptr )
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )

View File

@ -208,9 +208,9 @@ struct CudaParallelLaunch< DriverType , true > {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
}
// Copy functor to constant memory on the device
@ -246,9 +246,9 @@ struct CudaParallelLaunch< DriverType , false > {
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
}
else if ( shmem ) {
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
} else {
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 );
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
}
int* lock_array_ptr = lock_array_cuda_space_ptr();

View File

@ -45,6 +45,7 @@
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <algorithm>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
@ -106,6 +107,8 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
namespace Kokkos {
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
namespace {
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
@ -164,6 +167,8 @@ void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
void CudaSpace::access_error()
{
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
@ -178,6 +183,8 @@ void CudaSpace::access_error( const void * const )
/*--------------------------------------------------------------------------*/
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
@ -191,6 +198,8 @@ void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & track
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
bool CudaUVMSpace::available()
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
@ -203,11 +212,15 @@ bool CudaUVMSpace::available()
/*--------------------------------------------------------------------------*/
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
@ -301,8 +314,18 @@ attach_texture_object( const unsigned sizeof_alias
, void * const alloc_ptr
, size_t const alloc_size )
{
// Only valid for 300 <= __CUDA_ARCH__
// otherwise return zero.
enum { TEXTURE_BOUND_1D = 1u << 27 };
if ( ( alloc_ptr == 0 ) || ( sizeof_alias * TEXTURE_BOUND_1D <= alloc_size ) ) {
std::ostringstream msg ;
msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
<< " alloc_ptr(" << alloc_ptr << ")"
<< " alloc_size(" << alloc_size << ")"
<< " max_size(" << ( sizeof_alias * TEXTURE_BOUND_1D ) << ")" ;
std::cerr << msg.str() << std::endl ;
std::cerr.flush();
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
::cudaTextureObject_t tex_obj ;
@ -505,6 +528,133 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
);
}
//----------------------------------------------------------------------------
void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
allocate_tracked( const Kokkos::CudaSpace & arg_space
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;
SharedAllocationRecord * const r =
allocate( arg_space , arg_alloc_label , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
void SharedAllocationRecord< Kokkos::CudaSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
}
}
void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size )
{
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
Kokkos::Impl::DeepCopy<CudaSpace,CudaSpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;
SharedAllocationRecord * const r =
allocate( arg_space , arg_alloc_label , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
}
}
void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size )
{
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
Kokkos::Impl::DeepCopy<CudaUVMSpace,CudaUVMSpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
if ( ! arg_alloc_size ) return (void *) 0 ;
SharedAllocationRecord * const r =
allocate( arg_space , arg_alloc_label , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
void SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
if ( arg_alloc_ptr != 0 ) {
SharedAllocationRecord * const r = get_record( arg_alloc_ptr );
RecordBase::decrement( r );
}
}
void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size )
{
SharedAllocationRecord * const r_old = get_record( arg_alloc_ptr );
SharedAllocationRecord * const r_new = allocate( r_old->m_space , r_old->get_label() , arg_alloc_size );
Kokkos::Impl::DeepCopy<CudaHostPinnedSpace,CudaHostPinnedSpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
//----------------------------------------------------------------------------
SharedAllocationRecord< Kokkos::CudaSpace , void > *
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
{
@ -514,15 +664,17 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr
#if 0
// Copy the header from the allocation
SharedAllocationHeader head ;
Header head ;
SharedAllocationHeader const * const head_cuda = Header::get_header( alloc_ptr );
Header const * const head_cuda = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
if ( alloc_ptr ) {
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
}
RecordCuda * const record = static_cast< RecordCuda * >( head.m_record );
RecordCuda * const record = alloc_ptr ? static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ;
if ( record->m_alloc_ptr != head_cuda ) {
if ( ! alloc_ptr || record->m_alloc_ptr != head_cuda ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
}
@ -548,9 +700,9 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_
using Header = SharedAllocationHeader ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
if ( h->m_record->m_alloc_ptr != h ) {
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
}
@ -563,9 +715,9 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
using Header = SharedAllocationHeader ;
using RecordCuda = SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > ;
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
if ( h->m_record->m_alloc_ptr != h ) {
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
}
@ -592,14 +744,25 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
head.m_label[0] = 0 ;
}
snprintf( buffer , 256 , "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
, reinterpret_cast<unsigned long>( r )
, reinterpret_cast<unsigned long>( r->m_prev )
, reinterpret_cast<unsigned long>( r->m_next )
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
}
snprintf( buffer , 256
, format_string
, reinterpret_cast<uintptr_t>( r )
, reinterpret_cast<uintptr_t>( r->m_prev )
, reinterpret_cast<uintptr_t>( r->m_next )
, reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<unsigned long>( r->m_dealloc )
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, head.m_label
);
std::cout << buffer ;
@ -612,8 +775,19 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
snprintf( buffer , 256 , "Cuda [ 0x%.12lx + %ld ] %s\n"
, reinterpret_cast< unsigned long >( r->data() )
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "Cuda [ 0x%.12lx + %ld ] %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "Cuda [ 0x%.12llx + %ld ] %s\n";
}
snprintf( buffer , 256
, format_string
, reinterpret_cast< uintptr_t >( r->data() )
, r->size()
, head.m_label
);

View File

@ -71,7 +71,7 @@ shared_allocation_record( Kokkos::CudaSpace const & arg_space
DestructFunctor * const functor =
reinterpret_cast< DestructFunctor * >(
reinterpret_cast< unsigned long >( record ) + sizeof(SharedAllocationRecord) );
reinterpret_cast< uintptr_t >( record ) + sizeof(SharedAllocationRecord) );
new( functor ) DestructFunctor( arg_destruct );

View File

@ -43,6 +43,8 @@
#include <Kokkos_Macros.hpp>
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
@ -56,6 +58,7 @@ namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
TextureAttribute::TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
@ -190,3 +193,6 @@ void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */

View File

@ -46,6 +46,8 @@
#include <Kokkos_Macros.hpp>
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
@ -85,7 +87,6 @@ struct TextureAttribute : public AllocatorAttributeBase
~TextureAttribute();
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
@ -184,4 +185,6 @@ public:
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP

View File

@ -222,10 +222,14 @@ private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
AllocationTracker m_scratchFlagsTracker;
AllocationTracker m_scratchSpaceTracker;
AllocationTracker m_scratchUnifiedTracker;
#endif
public:
@ -482,6 +486,32 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
#ifdef KOKKOS_CUDA_USE_UVM
if(!cuda_launch_blocking()) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
std::cout << " The code must call Cuda::fence() after each kernel" << std::endl;
std::cout << " or will likely crash when accessing data on the host." << std::endl;
}
const char * env_force_device_alloc = getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
bool force_device_alloc;
if (env_force_device_alloc == 0) force_device_alloc=false;
else force_device_alloc=atoi(env_force_device_alloc)!=0;
const char * env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
bool visible_devices_one=true;
if (env_visible_devices == 0) visible_devices_one=false;
if(!visible_devices_one && !force_device_alloc) {
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl;
std::cout << " This could on multi GPU systems lead to severe performance" << std::endl;
std::cout << " penalties." << std::endl;
}
#endif
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_cuda_space();
@ -501,9 +531,27 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchFlags"
, ( sizeof( ScratchGrain ) * m_scratchFlagsCount ) );
Record::increment( r );
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
#endif
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
@ -517,9 +565,26 @@ CudaInternal::scratch_space( const Cuda::size_type size )
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
, "InternalScratchSpace"
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchSpace ;
@ -533,8 +598,26 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
, "InternalScratchUnified"
, ( sizeof( ScratchGrain ) * m_scratchUnifiedCount ) );
Record::increment( r );
m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchUnified ;
@ -555,10 +638,23 @@ void CudaInternal::finalize()
::free( m_stream );
}
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
m_scratchSpaceTracker.clear();
m_scratchFlagsTracker.clear();
m_scratchUnifiedTracker.clear();
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
#endif
m_cudaDev = -1 ;
m_maxWarpCount = 0 ;
m_maxBlock = 0 ;

View File

@ -43,7 +43,7 @@
#ifndef KOKKOS_CUDA_INTERNAL_HPP
#define KOKKOS_CUDA_INTERNAL_HPP
#include<iostream>
#include <Kokkos_Macros.hpp>
/* only compile this file if CUDA is enabled for Kokkos */
@ -53,18 +53,21 @@
namespace Kokkos { namespace Impl {
template<class DriverType, bool Large>
struct CudaGetMaxBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra);
}
template<class DriverType>
int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
int numBlocks;
if(Large) {
struct CudaGetMaxBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
int numBlocks;
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -73,7 +76,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -83,9 +86,16 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
} else {
}
};
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
int numBlocks;
int blockSize=32;
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
@ -94,7 +104,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -105,42 +115,58 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
if(numBlocks>0) return blockSize;
else return blockSize/2;
}
#endif
};
template<class DriverType, bool Large>
struct CudaGetOptBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra);
}
template<class DriverType>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
#if ( CUDA_VERSION < 6050 )
return 256;
#else
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
struct CudaGetOptBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
if(Large) {
while(blockSize<1024) {
blockSize*=2;
//calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
} else {
return bestBlockSize;
}
};
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
while(blockSize<1024) {
blockSize*=2;
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -153,10 +179,9 @@ int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
bestBlockSize = blockSize;
}
}
return bestBlockSize;
}
return bestBlockSize;
#endif
}
};
}} // namespace Kokkos::Impl

File diff suppressed because it is too large Load Diff

View File

@ -117,7 +117,7 @@ inline void cuda_inter_warp_reduction( ValueType& value,
value = result[0];
for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++)
for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
join(value,result[i]);
}
@ -345,8 +345,11 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// '__ffs' = position of the least significant bit set to 1.
// 'blockDim.y' is guaranteed to be a power of two so this
// is the integral shift value that can replace an integral divide.
const unsigned BlockSizeShift = __ffs( blockDim.y ) - 1 ;
const unsigned BlockSizeMask = blockDim.y - 1 ;
const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y );
// Must have power of two thread count
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }

View File

@ -53,6 +53,7 @@
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_CudaSpace.hpp>
#include <impl/Kokkos_Shape.hpp>
#include <Kokkos_View.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
@ -89,6 +90,8 @@ struct AssertShapeBoundsAbort< CudaSpace >
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
namespace Kokkos {
namespace Impl {
@ -419,6 +422,8 @@ public:
}
}
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

File diff suppressed because it is too large Load Diff

View File

@ -45,6 +45,7 @@
#define KOKKOS_ARRAY
#include <type_traits>
#include <algorithm>
#include <limits>
namespace Kokkos {

View File

@ -0,0 +1,529 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_COMPLEX_HPP
#define KOKKOS_COMPLEX_HPP
#include <Kokkos_Atomic.hpp>
#include <complex>
#include <iostream>
namespace Kokkos {
/// \class complex
/// \brief Partial reimplementation of std::complex that works as the
/// result of a Kokkos::parallel_reduce.
/// \tparam RealType The type of the real and imaginary parts of the
/// complex number. As with std::complex, this is only defined for
/// \c float, \c double, and <tt>long double</tt>. The latter is
/// currently forbidden in CUDA device kernels.
template<class RealType>
class complex {
private:
RealType re_, im_;
public:
//! The type of the real or imaginary parts of this complex number.
typedef RealType value_type;
//! Default constructor (initializes both real and imaginary parts to zero).
KOKKOS_INLINE_FUNCTION complex () :
re_ (0.0), im_ (0.0)
{}
//! Copy constructor.
KOKKOS_INLINE_FUNCTION complex (const complex<RealType>& src) :
re_ (src.re_), im_ (src.im_)
{}
//! Copy constructor from volatile.
KOKKOS_INLINE_FUNCTION complex (const volatile complex<RealType>& src) :
re_ (src.re_), im_ (src.im_)
{}
/// \brief Conversion constructor from std::complex.
///
/// This constructor cannot be called in a CUDA device function,
/// because std::complex's methods and nonmember functions are not
/// marked as CUDA device functions.
template<class InputRealType>
complex (const std::complex<InputRealType>& src) :
re_ (std::real (src)), im_ (std::imag (src))
{}
/// \brief Conversion operator to std::complex.
///
/// This operator cannot be called in a CUDA device function,
/// because std::complex's methods and nonmember functions are not
/// marked as CUDA device functions.
operator std::complex<RealType> () const {
return std::complex<RealType> (re_, im_);
}
/// \brief Constructor that takes just the real part, and sets the
/// imaginary part to zero.
template<class InputRealType>
KOKKOS_INLINE_FUNCTION complex (const InputRealType& val) :
re_ (val), im_ (0.0)
{}
//! Constructor that takes the real and imaginary parts.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
re_ (re), im_ (im)
{}
//! Assignment operator.
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator= (const complex<InputRealType>& src) {
re_ = src.re_;
im_ = src.im_;
return *this;
}
//! Assignment operator.
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
re_ = src.re_;
im_ = src.im_;
return *this;
}
//! Assignment operator.
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
volatile complex<RealType>& operator= (const volatile complex<InputRealType>& src) volatile {
re_ = src.re_;
im_ = src.im_;
return *this;
}
//! Assignment operator.
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator= (const volatile complex<InputRealType>& src) {
re_ = src.re_;
im_ = src.im_;
return *this;
}
//! Assignment operator (from a real number).
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator= (const InputRealType& val) {
re_ = val;
im_ = static_cast<RealType> (0.0);
return *this;
}
//! Assignment operator (from a real number).
template<class InputRealType>
KOKKOS_INLINE_FUNCTION
void operator= (const InputRealType& val) volatile {
re_ = val;
im_ = static_cast<RealType> (0.0);
}
/// \brief Assignment operator from std::complex.
///
/// This constructor cannot be called in a CUDA device function,
/// because std::complex's methods and nonmember functions are not
/// marked as CUDA device functions.
template<class InputRealType>
complex<RealType>& operator= (const std::complex<InputRealType>& src) {
re_ = std::real (src);
im_ = std::imag (src);
return *this;
}
//! The imaginary part of this complex number.
KOKKOS_INLINE_FUNCTION RealType& imag () {
return im_;
}
//! The real part of this complex number.
KOKKOS_INLINE_FUNCTION RealType& real () {
return re_;
}
//! The imaginary part of this complex number.
KOKKOS_INLINE_FUNCTION const RealType imag () const {
return im_;
}
//! The real part of this complex number.
KOKKOS_INLINE_FUNCTION const RealType real () const {
return re_;
}
//! The imaginary part of this complex number (volatile overload).
KOKKOS_INLINE_FUNCTION volatile RealType& imag () volatile {
return im_;
}
//! The real part of this complex number (volatile overload).
KOKKOS_INLINE_FUNCTION volatile RealType& real () volatile {
return re_;
}
//! The imaginary part of this complex number (volatile overload).
KOKKOS_INLINE_FUNCTION const RealType imag () const volatile {
return im_;
}
//! The real part of this complex number (volatile overload).
KOKKOS_INLINE_FUNCTION const RealType real () const volatile {
return re_;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const complex<RealType>& src) {
re_ += src.re_;
im_ += src.im_;
return *this;
}
KOKKOS_INLINE_FUNCTION
void operator += (const volatile complex<RealType>& src) volatile {
re_ += src.re_;
im_ += src.im_;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator += (const RealType& src) {
re_ += src;
return *this;
}
KOKKOS_INLINE_FUNCTION
void operator += (const volatile RealType& src) volatile {
re_ += src;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator -= (const complex<RealType>& src) {
re_ -= src.re_;
im_ -= src.im_;
return *this;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator -= (const RealType& src) {
re_ -= src;
return *this;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator *= (const complex<RealType>& src) {
const RealType realPart = re_ * src.re_ - im_ * src.im_;
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
re_ = realPart;
im_ = imagPart;
return *this;
}
KOKKOS_INLINE_FUNCTION
void operator *= (const volatile complex<RealType>& src) volatile {
const RealType realPart = re_ * src.re_ - im_ * src.im_;
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
re_ = realPart;
im_ = imagPart;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator *= (const RealType& src) {
re_ *= src;
im_ *= src;
return *this;
}
KOKKOS_INLINE_FUNCTION
void operator *= (const volatile RealType& src) volatile {
re_ *= src;
im_ *= src;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator /= (const complex<RealType>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (y.real ()) + ::fabs (y.imag ());
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
// because y/s is NaN.
if (s == 0.0) {
this->re_ /= s;
this->im_ /= s;
}
else {
const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
*this = x_scaled * y_conj_scaled;
*this /= y_scaled_abs;
}
return *this;
}
KOKKOS_INLINE_FUNCTION
complex<RealType>& operator /= (const RealType& src) {
re_ /= src;
im_ /= src;
return *this;
}
};
//! Binary + operator for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
}
//! Unary + operator for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator + (const complex<RealType>& x) {
return x;
}
//! Binary - operator for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
}
//! Unary - operator for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator - (const complex<RealType>& x) {
return complex<RealType> (-x.real (), -x.imag ());
}
//! Binary * operator for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
}
/// \brief Binary * operator for std::complex and complex.
///
/// This function exists because GCC 4.7.2 (and perhaps other
/// compilers) are not able to deduce that they can multiply
/// std::complex by Kokkos::complex, by first converting std::complex
/// to Kokkos::complex.
///
/// This function cannot be called in a CUDA device function, because
/// std::complex's methods and nonmember functions are not marked as
/// CUDA device functions.
template<class RealType>
complex<RealType>
operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
x.real () * y.imag () + x.imag () * y.real ());
}
/// \brief Binary * operator for RealType times complex.
///
/// This function exists because the compiler doesn't know that
/// RealType and complex<RealType> commute with respect to operator*.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator * (const RealType& x, const complex<RealType>& y) {
return complex<RealType> (x * y.real (), x * y.imag ());
}
//! Imaginary part of a complex number.
template<class RealType>
KOKKOS_INLINE_FUNCTION
RealType imag (const complex<RealType>& x) {
return x.imag ();
}
//! Real part of a complex number.
template<class RealType>
KOKKOS_INLINE_FUNCTION
RealType real (const complex<RealType>& x) {
return x.real ();
}
//! Absolute value (magnitude) of a complex number.
template<class RealType>
KOKKOS_INLINE_FUNCTION
RealType abs (const complex<RealType>& x) {
// FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
return ::sqrt (real (x) * real (x) + imag (x) * imag (x));
}
//! Conjugate of a complex number.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType> conj (const complex<RealType>& x) {
return complex<RealType> (real (x), -imag (x));
}
//! Binary operator / for complex and real numbers
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
complex<RealType1>
operator / (const complex<RealType1>& x, const RealType2& y) {
return complex<RealType1> (real (x) / y, imag (x) / y);
}
//! Binary operator / for complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
complex<RealType>
operator / (const complex<RealType>& x, const complex<RealType>& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
const RealType s = ::fabs (real (y)) + ::fabs (imag (y));
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
// because y/s is NaN.
if (s == 0.0) {
return complex<RealType> (real (x) / s, imag (x) / s);
}
else {
const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
complex<RealType> result = x_scaled * y_conj_scaled;
result /= y_scaled_abs;
return result;
}
}
//! Equality operator for two complex numbers.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
return real (x) == real (y) && imag (x) == imag (y);
}
//! Equality operator for std::complex and Kokkos::complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
return std::real (x) == real (y) && std::imag (x) == imag (y);
}
//! Equality operator for complex and real number.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator == (const complex<RealType1>& x, const RealType2& y) {
return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
}
//! Equality operator for real and complex number.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator == (const RealType& x, const complex<RealType>& y) {
return y == x;
}
//! Inequality operator for two complex numbers.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
return real (x) != real (y) || imag (x) != imag (y);
}
//! Inequality operator for std::complex and Kokkos::complex.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
return std::real (x) != real (y) || std::imag (x) != imag (y);
}
//! Inequality operator for complex and real number.
template<class RealType1, class RealType2>
KOKKOS_INLINE_FUNCTION
bool operator != (const complex<RealType1>& x, const RealType2& y) {
return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
}
//! Inequality operator for real and complex number.
template<class RealType>
KOKKOS_INLINE_FUNCTION
bool operator != (const RealType& x, const complex<RealType>& y) {
return y != x;
}
template<class RealType>
std::ostream& operator << (std::ostream& os, const complex<RealType>& x) {
const std::complex<RealType> x_std (Kokkos::real (x), Kokkos::imag (x));
os << x_std;
return os;
}
template<class RealType>
std::ostream& operator >> (std::ostream& os, complex<RealType>& x) {
std::complex<RealType> x_std;
os >> x_std;
x = x_std; // only assigns on success of above
return os;
}
} // namespace Kokkos
#endif // KOKKOS_COMPLEX_HPP

View File

@ -49,22 +49,22 @@
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_HAVE_CUDA )
#include <Kokkos_Cuda.hpp>
#if defined( KOKKOS_HAVE_SERIAL )
#include <Kokkos_Serial.hpp>
#endif
#if defined( KOKKOS_HAVE_OPENMP )
#include <Kokkos_OpenMP.hpp>
#endif
#if defined( KOKKOS_HAVE_SERIAL )
#include <Kokkos_Serial.hpp>
#endif
#if defined( KOKKOS_HAVE_PTHREAD )
#include <Kokkos_Threads.hpp>
#endif
#if defined( KOKKOS_HAVE_CUDA )
#include <Kokkos_Cuda.hpp>
#endif
#include <Kokkos_Pair.hpp>
#include <Kokkos_Array.hpp>
#include <Kokkos_View.hpp>
@ -72,10 +72,8 @@
#include <Kokkos_Atomic.hpp>
#include <Kokkos_hwloc.hpp>
#include <iostream>
#ifdef KOKKOS_HAVE_CXX11
////#include <Kokkos_Complex.hpp>
#include <Kokkos_Complex.hpp>
#endif
@ -107,9 +105,70 @@ void finalize_all();
void fence();
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
/* Allocate memory from a memory space.
* The allocation is tracked in Kokkos memory tracking system, so
* leaked memory can be identified.
*/
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_malloc( const std::string & arg_alloc_label
, const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
return Impl::SharedAllocationRecord< MemorySpace >::
allocate_tracked( MemorySpace() , arg_alloc_label , arg_alloc_size );
}
#ifdef KOKKOS_HAVE_CXX11
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_malloc( const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
return Impl::SharedAllocationRecord< MemorySpace >::
allocate_tracked( MemorySpace() , "no-label" , arg_alloc_size );
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void kokkos_free( void * arg_alloc )
{
typedef typename Space::memory_space MemorySpace ;
return Impl::SharedAllocationRecord< MemorySpace >::
deallocate_tracked( arg_alloc );
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
return Impl::SharedAllocationRecord< MemorySpace >::
reallocate_tracked( arg_alloc , arg_alloc_size );
}
} // namespace Experimental
} // namespace Kokkos
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
namespace Kokkos {
using Kokkos::Experimental::kokkos_malloc ;
using Kokkos::Experimental::kokkos_realloc ;
using Kokkos::Experimental::kokkos_free ;
}
#else
namespace Kokkos {
namespace Impl {
@ -161,7 +220,10 @@ void kokkos_free(const void* ptr) {
template< class Arg = DefaultExecutionSpace>
const void* kokkos_realloc(const void* old_ptr, size_t size) {
void* kokkos_realloc(const void* old_ptr, size_t size) {
if(old_ptr == NULL)
return kokkos_malloc<Arg>(size);
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
@ -172,64 +234,11 @@ const void* kokkos_realloc(const void* old_ptr, size_t size) {
}
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_malloc( const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
RecordBase::increment( r );
return r->data();
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void kokkos_free( void * arg_alloc )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r = RecordHost::get_record( arg_alloc );
RecordBase::decrement( r );
}
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
inline
void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
{
typedef typename Space::memory_space MemorySpace ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
RecordHost * const r_old = RecordHost::get_record( arg_alloc );
RecordHost * const r_new = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
Kokkos::Impl::DeepCopy<MemorySpace,MemorySpace>( r_new->data() , r_old->data()
, std::min( r_old->size() , r_new->size() ) );
RecordBase::increment( r_new );
RecordBase::decrement( r_old );
return r_new->data();
}
} // namespace Experimental
} // namespace Kokkos
#endif

View File

@ -50,6 +50,22 @@
#include <Kokkos_Macros.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
struct AUTO_t {
KOKKOS_INLINE_FUNCTION
constexpr const AUTO_t & operator()() const { return *this ; }
};
namespace {
/**\brief Token to indicate that a parameter's value is to be automatically selected */
constexpr AUTO_t AUTO = Kokkos::AUTO_t();
}
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Forward declarations for class inter-relationships
@ -58,6 +74,12 @@ namespace Kokkos {
class HostSpace ; ///< Memory space for main process and CPU execution spaces
#ifdef KOKKOS_HAVE_HBWSPACE
namespace Experimental {
class HBWSpace ; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
}
#endif
#if defined( KOKKOS_HAVE_SERIAL )
class Serial ; ///< Execution space main process on CPU
#endif // defined( KOKKOS_HAVE_SERIAL )
@ -162,9 +184,15 @@ struct VerifyExecutionCanAccessMemorySpace< Space , Space >
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
void fence();
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */

View File

@ -75,6 +75,10 @@ public:
typedef unsigned int size_type ;
/*--------------------------------*/
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Impl::CudaMallocAllocator allocator;
/** \brief Allocate a contiguous block of memory.
@ -96,6 +100,8 @@ public:
);
#endif
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
/*--------------------------------*/
CudaSpace();
@ -103,10 +109,10 @@ public:
CudaSpace & operator = ( const CudaSpace & rhs ) = default ;
~CudaSpace() = default ;
/**\brief Allocate memory in the cuda space */
/**\brief Allocate untracked memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
/**\brief Deallocate untracked memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
@ -162,6 +168,10 @@ public:
/** \brief If UVM capability is available */
static bool available();
/*--------------------------------*/
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Impl::CudaUVMAllocator allocator;
/** \brief Allocate a contiguous block of memory.
@ -182,6 +192,9 @@ public:
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
/*--------------------------------*/
CudaUVMSpace();
@ -189,10 +202,10 @@ public:
CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ;
~CudaUVMSpace() = default ;
/**\brief Allocate memory in the cuda space */
/**\brief Allocate untracked memory in the cuda space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
/**\brief Deallocate untracked memory in the cuda space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
@ -223,6 +236,9 @@ public:
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef unsigned int size_type ;
/*--------------------------------*/
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Impl::CudaHostAllocator allocator ;
@ -234,6 +250,8 @@ public:
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
/*--------------------------------*/
CudaHostPinnedSpace();
@ -241,10 +259,10 @@ public:
CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ;
~CudaHostPinnedSpace() = default ;
/**\brief Allocate memory in the cuda space */
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the cuda space */
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
@ -631,8 +649,24 @@ public:
static SharedAllocationRecord * allocate( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
);
, const size_t arg_alloc_size );
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::CudaSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
template< typename AliasType >
inline
@ -660,8 +694,6 @@ public:
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false );
};
@ -704,6 +736,24 @@ public:
, const size_t arg_alloc_size
);
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
template< typename AliasType >
inline
::cudaTextureObject_t attach_texture_object()
@ -731,8 +781,6 @@ public:
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
}
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false );
};
@ -772,6 +820,21 @@ public:
, const std::string & arg_label
, const size_t arg_alloc_size
);
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );

View File

@ -78,8 +78,9 @@ template< class Arg0 = void , class Arg1 = void , class Arg2 = void
, class ExecSpace =
// The first argument is the execution space,
// otherwise use the default execution space.
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
typename std::conditional
< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
>
class RangePolicy {
private:
@ -117,8 +118,8 @@ private:
) >::value };
// The work argument tag is the first or second argument
typedef typename Impl::if_c< Arg0_WorkTag , Arg0 ,
typename Impl::if_c< Arg1_WorkTag , Arg1 , void
typedef typename std::conditional< Arg0_WorkTag , Arg0 ,
typename std::conditional< Arg1_WorkTag , Arg1 , void
>::type >::type
WorkTag ;
@ -128,17 +129,18 @@ private:
unsigned(DefaultIntValue) ))) };
// Only accept the integral type if the blocking is a power of two
typedef typename Impl::enable_if< Impl::is_power_of_two< Granularity >::value ,
typename Impl::if_c< Arg0_IntType , Arg0 ,
typename Impl::if_c< Arg1_IntType , Arg1 ,
typename Impl::if_c< Arg2_IntType , Arg2 ,
typename Impl::if_c< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
typename Impl::if_c< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
typename Impl::if_c< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
DefaultIntType
>::type >::type >::type
>::type >::type >::type
>::type
static_assert( Impl::is_integral_power_of_two( Granularity )
, "RangePolicy blocking granularity must be power of two" );
typedef typename std::conditional< Arg0_IntType , Arg0 ,
typename std::conditional< Arg1_IntType , Arg1 ,
typename std::conditional< Arg2_IntType , Arg2 ,
typename std::conditional< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
typename std::conditional< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
typename std::conditional< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
DefaultIntType
>::type >::type >::type
>::type >::type >::type
IntType ;
enum { GranularityMask = IntType(Granularity) - 1 };
@ -187,8 +189,8 @@ public:
* Typically used to partition a range over a group of threads.
*/
struct WorkRange {
typedef RangePolicy::work_tag work_tag ;
typedef RangePolicy::member_type member_type ;
typedef typename RangePolicy::work_tag work_tag ;
typedef typename RangePolicy::member_type member_type ;
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
@ -233,6 +235,38 @@ public:
namespace Kokkos {
namespace Experimental {
/** \brief Scratch memory request accepting per team and per thread value
*
* An instance of this class can be given as the last argument to a
* TeamPolicy constructor. It sets the amount of user requested shared
* memory for the team.
*/
template< class MemorySpace >
class TeamScratchRequest {
size_t m_per_team;
size_t m_per_thread;
public:
TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0):
m_per_team(per_team_), m_per_thread(per_thread_) {
}
size_t per_team() const {
return m_per_team;
}
size_t per_thread() const {
return m_per_thread;
}
size_t total(const size_t team_size) const {
return m_per_team + m_per_thread * team_size;
}
};
}
/** \brief Execution policy for parallel work over a league of teams of threads.
*
* The work functor is called for each thread of each team such that
@ -258,8 +292,9 @@ template< class Arg0 = void
, class ExecSpace =
// If the first argument is not an execution
// then use the default execution space.
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
typename std::conditional
< Impl::is_execution_space< Arg0 >::value , Arg0
, Kokkos::DefaultExecutionSpace >::type
>
class TeamPolicy {
private:
@ -268,7 +303,7 @@ private:
enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
enum { ArgOption_OK = Impl::StaticAssert< ( Arg0_ExecSpace || Arg1_Void ) >::value };
typedef typename Impl::if_c< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
typedef typename std::conditional< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
public:
@ -300,10 +335,20 @@ public:
static int team_size_recommended( const FunctorType & , const int&);
//----------------------------------------
/** \brief Construct policy with the given instance of the execution space */
TeamPolicy( const execution_space & , int league_size_request , int team_size_request );
TeamPolicy( const execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 );
TeamPolicy( const execution_space & , int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
/** \brief Construct policy with the default instance of the execution space */
TeamPolicy( int league_size_request , int team_size_request );
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 );
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
template<class MemorySpace>
TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
template<class MemorySpace>
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
/** \brief The actual league size (number of teams) of the policy.
*

View File

@ -0,0 +1,327 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HBWSPACE_HPP
#define KOKKOS_HBWSPACE_HPP
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_HBWAllocators.hpp>
/*--------------------------------------------------------------------------*/
#ifdef KOKKOS_HAVE_HBWSPACE
namespace Kokkos {
namespace Experimental {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
///
/// Arbitrary atomics are implemented using a hash table of locks
/// where the hash value is derived from the address of the
/// object for which an atomic operation is performed.
/// This function initializes the locks to zero (unset).
void init_lock_array_hbw_space();
/// \brief Aquire a lock for the address
///
/// This function tries to aquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully aquired the
/// function returns true. Otherwise it returns false.
bool lock_address_hbw_space(void* ptr);
/// \brief Release lock for the address
///
/// This function releases the lock for the hash value derived
/// from the provided ptr. This function should only be called
/// after previously successfully aquiring a lock with
/// lock_address.
void unlock_address_hbw_space(void* ptr);
} // namespace Impl
} // neamspace Experimental
} // namespace Kokkos
namespace Kokkos {
namespace Experimental {
/// \class HBWSpace
/// \brief Memory management for host memory.
///
/// HBWSpace is a memory space that governs host memory. "Host"
/// memory means the usual CPU-accessible memory.
class HBWSpace {
public:
//! Tag this class as a kokkos memory space
typedef HBWSpace memory_space ;
typedef size_t size_type ;
/// \typedef execution_space
/// \brief Default execution space for this memory space.
///
/// Every memory space has a default execution space. This is
/// useful for things like initializing a View (which happens in
/// parallel using the View's default execution space).
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Kokkos::Threads execution_space ;
#elif defined( KOKKOS_HAVE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
#elif defined( KOKKOS_HAVE_PTHREAD )
typedef Kokkos::Threads execution_space ;
#elif defined( KOKKOS_HAVE_SERIAL )
typedef Kokkos::Serial execution_space ;
#else
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
#endif
//! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Impl::HBWMallocAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
/*--------------------------------*/
/* Functions unique to the HBWSpace */
static int in_parallel();
static void register_in_parallel( int (*)() );
/*--------------------------------*/
/**\brief Default memory space instance */
HBWSpace();
HBWSpace( const HBWSpace & rhs ) = default ;
HBWSpace & operator = ( const HBWSpace & ) = default ;
~HBWSpace() = default ;
/**\brief Non-default memory space instance to choose allocation mechansim, if available */
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
explicit
HBWSpace( const AllocationMechanism & );
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
private:
AllocationMechanism m_alloc_mech ;
friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
};
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
: public SharedAllocationRecord< void , void >
{
private:
friend Kokkos::Experimental::HBWSpace ;
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
static void deallocate( RecordBase * );
/**\brief Root record for tracked allocations from this HBWSpace instance */
static RecordBase s_root_record ;
const Kokkos::Experimental::HBWSpace m_space ;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() = default ;
SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
, const RecordBase::function_type arg_dealloc = & deallocate
);
public:
inline
std::string get_label() const
{
return std::string( RecordBase::head()->m_label );
}
KOKKOS_INLINE_FUNCTION static
SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
#else
return (SharedAllocationRecord *) 0 ;
#endif
}
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
};
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<class ExecutionSpace>
struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
exec.fence();
memcpy( dst , src , n );
}
};
template<class ExecutionSpace>
struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
exec.fence();
memcpy( dst , src , n );
}
};
template<class ExecutionSpace>
struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
exec.fence();
memcpy( dst , src , n );
}
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
};
} // namespace Impl
} // namespace Kokkos
#endif
#endif /* #define KOKKOS_HBWSPACE_HPP */

View File

@ -128,6 +128,8 @@ public:
//! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
typedef Impl::PageAlignedAllocator allocator ;
@ -143,6 +145,8 @@ public:
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
/*--------------------------------*/
/* Functions unique to the HostSpace */
static int in_parallel();
@ -164,10 +168,10 @@ public:
explicit
HostSpace( const AllocationMechanism & );
/**\brief Allocate memory in the host space */
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
/**\brief Deallocate memory in the host space */
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
@ -239,6 +243,21 @@ public:
#endif
}
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc_size );
/**\brief Reallocate tracked memory in the space */
static
void * reallocate_tracked( void * const arg_alloc_ptr
, const size_t arg_alloc_size );
/**\brief Deallocate tracked memory in the space */
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );

View File

@ -157,10 +157,15 @@ struct LayoutStride {
/// both tile dimensions are powers of two, Kokkos can optimize
/// further.
template < unsigned ArgN0 , unsigned ArgN1 ,
bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value &&
Impl::is_power_of_two<ArgN1>::value )
bool IsPowerOfTwo = ( Impl::is_integral_power_of_two(ArgN0) &&
Impl::is_integral_power_of_two(ArgN1) )
>
struct LayoutTileLeft {
static_assert( Impl::is_integral_power_of_two(ArgN0) &&
Impl::is_integral_power_of_two(ArgN1)
, "LayoutTileLeft must be given power-of-two tile dimensions" );
//! Tag this class as a kokkos array layout
typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;

View File

@ -416,5 +416,11 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
#define KOKKOS_POSIX_MEMALIGN_AVAILABLE 1
#endif
#endif
#endif /* #ifndef KOKKOS_MACROS_HPP */

View File

@ -101,9 +101,9 @@ namespace Impl {
*/
enum { MEMORY_ALIGNMENT =
#if defined( KOKKOS_MEMORY_ALIGNMENT )
( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value )
( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
#else
( 1 << Kokkos::Impl::power_of_two< 128 >::value )
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
#endif
, MEMORY_ALIGNMENT_THRESHOLD = 4
};

View File

@ -53,6 +53,9 @@
#include <cstddef>
#include <iosfwd>
#include <Kokkos_HostSpace.hpp>
#ifdef KOKKOS_HAVE_HBWSPACE
#include <Kokkos_HBWSpace.hpp>
#endif
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Layout.hpp>
@ -72,12 +75,16 @@ public:
//! Tag this class as a kokkos execution space
typedef OpenMP execution_space ;
#ifdef KOKKOS_HAVE_HBWSPACE
typedef Experimental::HBWSpace memory_space ;
#else
typedef HostSpace memory_space ;
#endif
//! This execution space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef LayoutRight array_layout ;
typedef HostSpace::size_type size_type ;
typedef memory_space::size_type size_type ;
typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;

View File

@ -207,8 +207,12 @@ void parallel_for( const ExecPolicy & policy
}
#endif
(void) Impl::ParallelFor< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID);
@ -235,7 +239,11 @@ void parallel_for( const size_t work_count
}
#endif
(void) Impl::ParallelFor< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -333,7 +341,11 @@ void parallel_reduce( const ExecPolicy & policy
}
#endif
(void) Impl::ParallelReduce< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -376,7 +388,11 @@ void parallel_reduce( const size_t work_count
}
#endif
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -394,7 +410,7 @@ void parallel_reduce( const ExecPolicy & policy
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
#endif
@ -408,7 +424,11 @@ void parallel_reduce( const ExecPolicy & policy
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -465,7 +485,11 @@ void parallel_reduce( const ExecPolicy & policy
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -482,7 +506,7 @@ void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Impl::is_view<ViewType>::value
, typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
@ -503,7 +527,11 @@ void parallel_reduce( const size_t work_count
}
#endif
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , ExecPolicy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -564,7 +592,11 @@ void parallel_reduce( const size_t work_count
}
#endif
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -813,7 +845,11 @@ void parallel_scan( const ExecutionPolicy & policy
}
#endif
Impl::ParallelScan< FunctorType , ExecutionPolicy > scan( Impl::CopyWithoutTracking::apply(functor) , policy );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {
@ -842,7 +878,11 @@ void parallel_scan( const size_t work_count
}
#endif
(void) Impl::ParallelScan< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#ifdef KOKKOSP_ENABLE_PROFILING
if(Kokkos::Experimental::profileLibraryLoaded()) {

View File

@ -151,7 +151,7 @@ public:
static void finalize() {}
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool detail = false ) {}
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
//--------------------------------------------------------------------------
@ -295,6 +295,7 @@ class TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
private:
const int m_league_size ;
const int m_scratch_size ;
public:
@ -326,15 +327,55 @@ public:
inline int team_size() const { return 1 ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_scratch_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
TeamPolicy( execution_space &
, int league_size_request
, int /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_league_size( league_size_request )
{ (void) vector_length_request; }
, m_scratch_size ( 0 )
{}
TeamPolicy( int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
TeamPolicy( execution_space &
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_league_size( league_size_request )
{ (void) vector_length_request; }
, m_scratch_size ( 0 )
{}
TeamPolicy( int league_size_request
, int /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_league_size( league_size_request )
, m_scratch_size ( 0 )
{}
TeamPolicy( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_league_size( league_size_request )
, m_scratch_size ( 0 )
{}
template<class MemorySpace>
TeamPolicy( int league_size_request
, int /* team_size_request */
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
: m_league_size(league_size_request)
, m_scratch_size(scratch_request.total(1))
{}
template<class MemorySpace>
TeamPolicy( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
: m_league_size(league_size_request)
, m_scratch_size(scratch_request.total(1))
{}
typedef Impl::SerialTeamMember member_type ;
};
@ -346,53 +387,69 @@ public:
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with RangePolicy */
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
public:
// work tag is void
template< class PType >
inline
ParallelFor( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
const FunctorType m_functor ;
const Policy m_policy ;
template< class TagType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec() const
{
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i );
}
}
// work tag is non-void
template< class PType >
inline
ParallelFor( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
template< class TagType >
KOKKOS_INLINE_FUNCTION
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec() const
{
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
const TagType t{} ;
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i );
}
}
public:
inline
void execute() const
{ this-> template exec< typename Policy::work_tag >(); }
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{}
};
/*--------------------------------------------------------------------------*/
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
>
{
public:
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
@ -401,123 +458,136 @@ public:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// Work tag is void
template< class ViewType , class PType >
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy
, const ViewType & result
)
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
pointer_type result_ptr = result.ptr_on_device();
reference_type update = ValueInit::init( m_functor , ptr );
if ( ! result_ptr ) {
result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update );
}
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
}
// Work tag is non-void
template< class ViewType , class PType >
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy
, const ViewType & result
)
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
pointer_type result_ptr = result.ptr_on_device();
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
if ( ! result_ptr ) {
result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update );
}
typename ValueTraits::reference_type update = ValueInit::init( functor , result_ptr );
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
}
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
}
public:
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
inline
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , 0 );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
}
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
}
};
/*--------------------------------------------------------------------------*/
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
public:
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
// work tag is void
template< class PType >
const FunctorType m_functor ;
const Policy m_policy ;
template< class TagType >
inline
ParallelScan( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
pointer_type result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( i , update , true );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update , true );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
// work tag is non-void
template< class PType >
template< class TagType >
inline
ParallelScan( typename Impl::enable_if<
( Impl::is_same< PType , Policy >::value &&
! Impl::is_same< typename PType::work_tag , void >::value
), const FunctorType & >::type functor
, const PType & policy )
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
pointer_type result_ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
reference_type update = ValueInit::init( functor , result_ptr );
const typename PType::member_type e = policy.end();
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , true );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update , true );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
}
public:
inline
void execute() const
{
pointer_type ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
this-> template exec< WorkTag >( ptr );
}
inline
ParallelScan( const FunctorType & arg_functor
, const Policy & arg_policy
)
: m_functor( arg_functor )
, m_policy( arg_policy )
{}
};
} // namespace Impl
@ -525,112 +595,157 @@ public:
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with TeamPolicy */
namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
>
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const int m_league ;
const int m_shared ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( member ); }
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( TagType() , member ); }
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec() const
{
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
Kokkos::Serial::scratch_memory_resize( 0 , shared_size );
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
ParallelFor::template driver< typename Policy::work_tag >
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) );
// functor( typename Policy::member_type(ileague,policy.league_size(),shared_size) );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) );
}
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec() const
{
const TagType t{} ;
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) );
}
}
public:
inline
void execute() const
{
Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
this-> template exec< typename Policy::work_tag >();
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{ }
};
/*--------------------------------------------------------------------------*/
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
>
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
public:
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
private:
const FunctorType m_functor ;
const int m_league ;
const int m_shared ;
pointer_type m_result_ptr ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member
, reference_type update )
{ functor( member , update ); }
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
reference_type update = ValueInit::init( m_functor , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
}
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member
, reference_type update )
{ functor( TagType() , member , update ); }
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
final( m_functor , ptr );
}
public:
template< class ViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const ViewType & result
)
inline
void execute() const
{
const int reduce_size = ValueTraits::value_size( functor );
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
void * const scratch_reduce = Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size );
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , m_shared );
const pointer_type result_ptr =
result.ptr_on_device() ? result.ptr_on_device()
: (pointer_type) scratch_reduce ;
reference_type update = ValueInit::init( functor , result_ptr );
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
ParallelReduce::template driver< typename Policy::work_tag >
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) , update );
}
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
}
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result
)
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
, m_result_ptr( arg_result.ptr_on_device() )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
}
};
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
namespace Kokkos {
namespace Impl {
template<typename iType>
@ -739,8 +854,6 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Ser
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
#ifdef KOKKOS_HAVE_CXX11
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
@ -764,8 +877,6 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Ser
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
#endif // KOKKOS_HAVE_CXX11
} //namespace Kokkos
namespace Kokkos {

View File

@ -47,11 +47,12 @@
#include <type_traits>
#include <string>
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Shape.hpp>
@ -444,14 +445,14 @@ template< class DataType ,
typename ViewTraits<DataType,Arg1Type,Arg2Type,Arg3Type>::specialize >
class View ;
namespace Impl {
template< class C >
struct is_view : public bool_< false > {};
struct is_view : public Impl::bool_< false > {};
template< class D , class A1 , class A2 , class A3 , class S >
struct is_view< View< D , A1 , A2 , A3 , S > > : public bool_< true > {};
struct is_view< View< D , A1 , A2 , A3 , S > > : public Impl::bool_< true > {};
namespace Impl {
using Kokkos::is_view ;
}
//----------------------------------------------------------------------------
@ -952,33 +953,37 @@ public:
Impl::ViewError::scalar_operator_called_from_non_scalar_view >
if_scalar_operator ;
typedef Impl::if_c< traits::rank == 0 ,
reference_type ,
Impl::ViewError::scalar_operator_called_from_non_scalar_view >
if_scalar_operator_return ;
KOKKOS_INLINE_FUNCTION
const View & operator = ( const typename if_scalar_operator::type & rhs ) const
{
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
*m_ptr_on_device = if_scalar_operator::select( rhs );
m_ptr_on_device[ 0 ] = if_scalar_operator::select( rhs );
return *this ;
}
KOKKOS_FORCEINLINE_FUNCTION
operator typename if_scalar_operator::type & () const
operator typename if_scalar_operator_return::type () const
{
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
return if_scalar_operator::select( *m_ptr_on_device );
return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
}
KOKKOS_FORCEINLINE_FUNCTION
typename if_scalar_operator::type & operator()() const
typename if_scalar_operator_return::type operator()() const
{
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
return if_scalar_operator::select( *m_ptr_on_device );
return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
}
KOKKOS_FORCEINLINE_FUNCTION
typename if_scalar_operator::type & operator*() const
typename if_scalar_operator_return::type operator*() const
{
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
return if_scalar_operator::select( *m_ptr_on_device );
return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
}
//------------------------------------
@ -1849,6 +1854,8 @@ void resize( View<T,L,D,M,S> & v ,
Impl::ViewRemap< view_type , view_type >( v_resized , v );
view_type::execution_space::fence();
v = v_resized ;
}
@ -2092,27 +2099,10 @@ struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} };
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#include <KokkosExp_View.hpp>
#else
// Must define before includng <impl/Kokkos_ViewOffset.hpp>
namespace Kokkos {
namespace Experimental {
namespace Impl {
struct ALL_t ;
}
}
using ALL = Experimental::Impl::ALL_t ;
}
#include <impl/Kokkos_ViewOffset.hpp>
#include <impl/Kokkos_ViewSupport.hpp>
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
#include <KokkosExp_View.hpp>
#endif /* #if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -109,7 +109,7 @@ unsigned thread_mapping( const char * const label ,
/** \brief Query core-coordinate of the current thread
* with respect to the core_topology.
*
* As long as the thread is running within the
* As long as the thread is running within the
* process binding the following condition holds.
*
* core_coordinate.first < core_topology.first
@ -120,6 +120,10 @@ std::pair<unsigned,unsigned> get_this_thread_coordinate();
/** \brief Bind the current thread to a core. */
bool bind_this_thread( const std::pair<unsigned,unsigned> );
/** \brief Can hwloc bind threads? */
bool can_bind_threads();
/** \brief Bind the current thread to one of the cores in the list.
* Set that entry to (~0,~0) and return the index.
* If binding fails return ~0.

View File

@ -4,14 +4,14 @@ PREFIX ?= /usr/local/lib/kokkos
default: messages build-lib
echo "End Build"
include $(KOKKOS_PATH)/Makefile.kokkos
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
CXX = nvcc_wrapper
CXX = $(NVCC_WRAPPER)
CXXFLAGS ?= -O3
LINK = nvcc_wrapper
LINK = $(NVCC_WRAPPER)
LINKFLAGS ?=
else
CXX ?= g++
@ -62,8 +62,10 @@ build-makefile-kokkos:
echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos
echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos
echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos
echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
echo "CXX ?= $(CXX)" >> Makefile.kokkos
echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos
echo "" >> Makefile.kokkos
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
@ -90,6 +92,7 @@ build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
mkdir:
mkdir -p $(PREFIX)
mkdir -p $(PREFIX)/bin
mkdir -p $(PREFIX)/include
mkdir -p $(PREFIX)/lib
mkdir -p $(PREFIX)/include/impl
@ -97,7 +100,7 @@ mkdir:
copy-cuda: mkdir
mkdir -p $(PREFIX)/include/Cuda
cp $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda
copy-threads: mkdir
mkdir -p $(PREFIX)/include/Threads
cp $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
@ -111,13 +114,14 @@ copy-openmp: mkdir
cp $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
install: mkdir $(CONDITIONAL_COPIES) build-lib
cp $(NVCC_WRAPPER) $(PREFIX)/bin
cp $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
cp $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
cp Makefile.kokkos $(PREFIX)
cp libkokkos.a $(PREFIX)/lib
cp KokkosCore_config.h $(PREFIX)/include
clean: kokkos-clean
rm Makefile.kokkos

View File

@ -57,41 +57,57 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const PType & range )
const FunctorType m_functor ;
const Policy m_policy ;
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( iwork );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork );
const TagType t{} ;
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( t , iwork );
}
}
public:
inline
ParallelFor( const FunctorType & functor
, const Policy & policy )
void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
@ -99,10 +115,20 @@ public:
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) );
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
}
/* END #pragma omp parallel */
}
inline
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{}
};
} // namespace Impl
@ -115,90 +141,119 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range )
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( iwork , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork , update );
const TagType t{} ;
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( t , iwork , update );
}
}
public:
inline
void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
}
/* END #pragma omp parallel */
// Reduction:
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
//----------------------------------------
template< class ViewType >
inline
ParallelReduce( typename Impl::enable_if<
( Impl::is_view< ViewType >::value &&
Impl::is_same< typename ViewType::memory_space , HostSpace >::value
), const FunctorType & >::type functor
, const Policy & policy
, const ViewType & result_view )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , 0 );
#pragma omp parallel
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result_view )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
driver( functor
, ValueInit::init( functor , exec.scratch_reduce() )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
);
static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
if ( result_view.ptr_on_device() ) {
const int n = ValueTraits::value_count( functor );
for ( int j = 0 ; j < n ; ++j ) { result_view.ptr_on_device()[j] = ptr[j] ; }
}
}
}
};
} // namespace Impl
@ -211,106 +266,129 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , WorkTag > ValueOps ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range
, const bool final )
const FunctorType m_functor ;
const Policy m_policy ;
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update , const bool final )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( iwork , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, reference_type update
, const PType & range
, const bool final )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update , const bool final )
{
const typename PType::member_type work_end = range.end();
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
functor( typename PType::work_tag() , iwork , update , final );
const TagType t{} ;
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
#pragma ivdep
#endif
#endif
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
functor( t , iwork , update , final );
}
}
public:
inline
void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr =
pointer_type( exec.scratch_reduce() ) +
ValueTraits::value_count( m_functor );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( m_functor , ptr ) , false );
}
/* END #pragma omp parallel */
{
const unsigned thread_count = OpenMPexec::pool_size();
const unsigned value_count = ValueTraits::value_count( m_functor );
pointer_type ptr_prev = 0 ;
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
if ( ptr_prev ) {
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
ValueJoin::join( m_functor , ptr + value_count , ptr );
}
else {
ValueInit::init( m_functor , ptr );
}
ptr_prev = ptr ;
}
}
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueOps::reference( ptr ) , true );
}
/* END #pragma omp parallel */
}
//----------------------------------------
inline
ParallelScan( const FunctorType & functor
, const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( functor ) , 0 );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueInit::init( functor , pointer_type( exec.scratch_reduce() ) + ValueTraits::value_count( functor ) )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
, false );
}
/* END #pragma omp parallel */
{
const unsigned thread_count = OpenMPexec::pool_size();
const unsigned value_count = ValueTraits::value_count( functor );
pointer_type ptr_prev = 0 ;
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
if ( ptr_prev ) {
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
ValueJoin::join( functor , ptr + value_count , ptr );
}
else {
ValueInit::init( functor , ptr );
}
ptr_prev = ptr ;
}
}
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
driver( functor
, ValueOps::reference( pointer_type( exec.scratch_reduce() ) )
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
, true );
}
/* END #pragma omp parallel */
}
ParallelScan( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{}
//----------------------------------------
};
@ -325,62 +403,84 @@ namespace Kokkos {
namespace Impl {
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
>
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
const int m_shmem_size ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( member ); }
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member )
{
for ( ; member.valid() ; member.next() ) {
functor( member );
}
}
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const FunctorType & >::type functor
, const typename Policy::member_type & member )
{ functor( TagType() , member ); }
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member )
{
const TagType t{} ;
for ( ; member.valid() ; member.next() ) {
functor( t , member );
}
}
public:
inline
ParallelFor( const FunctorType & functor ,
const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
OpenMPexec::resize_scratch( 0 , team_reduce_size + team_shmem_size );
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
#pragma omp parallel
{
typename Policy::member_type member( * OpenMPexec::get_thread_omp() , policy , team_shmem_size );
for ( ; member.valid() ; member.next() ) {
ParallelFor::template driver< typename Policy::work_tag >( functor , member );
{
ParallelFor::template exec_team< WorkTag >
( m_functor
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
}
}
/* END #pragma omp parallel */
}
}
void wait() {}
inline
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
};
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
>
{
private:
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
@ -388,102 +488,85 @@ private:
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
const int m_shmem_size ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const typename PType::member_type & member
, reference_type update )
{ functor( member , update ); }
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
{
for ( ; member.valid() ; member.next() ) {
functor( member , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
const FunctorType & >::type functor
, const typename PType::member_type & member
, reference_type update )
{ functor( typename PType::work_tag() , member , update ); }
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
{
const TagType t{} ;
for ( ; member.valid() ; member.next() ) {
functor( t , member , update );
}
}
public:
inline
ParallelReduce( const FunctorType & functor ,
const Policy & policy )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
ParallelReduce::template driver< Policy >( functor , member , update );
ParallelReduce::template exec_team< WorkTag >
( m_functor
, Member( exec , m_policy , m_shmem_size )
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
}
}
/* END #pragma omp parallel */
{
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag , reference_type > Join ;
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
int max_active_threads = OpenMPexec::pool_size();
if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
max_active_threads = m_policy.league_size()* m_policy.team_size();
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
Join::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
for ( int i = 1 ; i < max_active_threads ; ++i ) {
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( m_functor );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
}
}
template< class ViewType >
inline
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const ViewType & result )
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
ParallelReduce::template driver< Policy >( functor , member , update );
}
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
}
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
const int n = ValueTraits::value_count( functor );
for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; }
}
}
void wait() {}
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{}
};
} // namespace Impl

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -84,8 +84,16 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
OpenMPexec::Pool OpenMPexec::m_pool;
#else
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
#endif
void OpenMPexec::verify_is_process( const char * const label )
{
if ( omp_in_parallel() ) {
@ -102,6 +110,13 @@ void OpenMPexec::verify_initialized( const char * const label )
msg.append( " ERROR: not initialized" );
Kokkos::Impl::throw_runtime_exception( msg );
}
if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) {
std::string msg( label );
msg.append( " ERROR: Initialized but threads modified inappropriately" );
Kokkos::Impl::throw_runtime_exception( msg );
}
}
void OpenMPexec::clear_scratch()
@ -109,7 +124,16 @@ void OpenMPexec::clear_scratch()
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_pool[ rank_rev ] ) {
Record * const r = Record::get_record( m_pool[ rank_rev ] );
m_pool[ rank_rev ] = 0 ;
Record::decrement( r );
}
#else
m_pool.at(rank_rev).clear();
#endif
}
/* END #pragma omp parallel */
}
@ -147,7 +171,27 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
const int rank = pool_size - ( rank_rev + 1 );
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::HostSpace()
, "openmp_scratch"
, alloc_size );
Record::increment( r );
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
#else
#pragma omp critical
{
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
}
#endif
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
}
/* END #pragma omp parallel */
@ -248,7 +292,9 @@ void OpenMP::initialize( unsigned thread_count ,
// Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
const unsigned omp_rank = omp_get_thread_num();
const unsigned thread_r = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ;
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
: omp_rank ;
Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
}
@ -293,7 +339,7 @@ void OpenMP::finalize()
omp_set_num_threads(1);
if ( Impl::s_using_hwloc ) {
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
hwloc::unbind_this_thread();
}
}

View File

@ -61,6 +61,8 @@ public:
enum { MAX_THREAD_COUNT = 4096 };
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
struct Pool
{
Pool() : m_trackers() {}
@ -78,11 +80,21 @@ public:
}
};
private:
static Pool m_pool; // Indexed by: m_pool_rank_rev
#else
private:
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
#endif
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
static Pool m_pool; // Indexed by: m_pool_rank_rev
friend class Kokkos::OpenMP ;
@ -193,12 +205,14 @@ private:
inline
bool team_fan_in() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
}
if ( m_team_rank_rev ) {
m_exec.state_set( Rendezvous );
memory_fence();
m_exec.state_wait( Rendezvous );
}
@ -208,8 +222,10 @@ private:
inline
void team_fan_out() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
memory_fence();
}
}
@ -265,6 +281,7 @@ public:
{ return ValueType(); }
#else
{
memory_fence();
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
#endif
@ -301,6 +318,7 @@ public:
for ( int i = 1 ; i < m_team_size ; ++i ) {
op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
}
memory_fence();
// The base team member may "lap" the other team members,
// copy to their local value before proceeding.
@ -484,6 +502,8 @@ private:
int m_team_alloc ;
int m_team_iter ;
size_t m_scratch_size;
inline void init( const int league_size_request
, const int team_size_request )
{
@ -511,13 +531,49 @@ public:
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_scratch_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1)
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
TeamPolicy( execution_space &
, int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_scratch_size ( 0 )
{ init( league_size_request , team_size_request ); }
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
TeamPolicy( execution_space &
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1)
: m_scratch_size ( 0 )
{ init( league_size_request , execution_space::thread_pool_size(2) ); }
TeamPolicy( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_scratch_size ( 0 )
{ init( league_size_request , team_size_request ); }
TeamPolicy( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_scratch_size ( 0 )
{ init( league_size_request , execution_space::thread_pool_size(2) ); }
template<class MemorySpace>
TeamPolicy( int league_size_request
, int team_size_request
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
: m_scratch_size(scratch_request.total(team_size_request))
{ init(league_size_request,team_size_request); }
template<class MemorySpace>
TeamPolicy( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
: m_scratch_size(scratch_request.total(execution_space::thread_pool_size(2)))
{ init(league_size_request,execution_space::thread_pool_size(2)); }
inline int team_alloc() const { return m_team_alloc ; }
inline int team_iter() const { return m_team_iter ; }

View File

@ -212,7 +212,7 @@ public:
// Join from lower ranking to higher ranking worker.
// Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
for ( int i = m_worker_size - 1 ; --i ; ) {
for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
}
}

View File

@ -61,47 +61,50 @@ namespace Impl {
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
const FunctorType m_func ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ;
const FunctorType m_functor ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( i );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
const TagType t{} ;
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( t , i );
}
}
// Function is called once by every concurrent thread.
static void execute( QthreadExec & exec , const void * arg )
static void exec( QthreadExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() ) );
const WorkRange range( self.m_policy, exec.worker_rank(), exec.worker_size() );
ParallelFor::template exec_range< WorkTag > ( self.m_functor , range.begin() , range.end() );
// All threads wait for completion.
exec.exec_all_barrier();
@ -109,95 +112,110 @@ private:
public:
ParallelFor( const FunctorType & functor
, const Policy & policy
)
: m_func( functor )
, m_policy( policy )
inline
void execute() const
{
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy
)
: m_functor( arg_functor )
, m_policy( arg_policy )
{ }
};
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Qthread >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( i , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
const TagType t{} ;
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( t , i , update );
}
}
static void execute( QthreadExec & exec , const void * arg )
static void exec( QthreadExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
driver( self.m_func
, ValueInit::init( self.m_func , exec.exec_all_reduce_value() )
, typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() )
);
const WorkRange range( self.m_policy, exec.worker_rank(), exec.worker_size() );
exec.template exec_all_reduce<FunctorType, typename Policy::work_tag >( self.m_func );
ParallelReduce::template exec_range< WorkTag >(
self.m_functor, range.begin(), range.end(),
ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor );
}
public:
template< class HostViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const HostViewType & result_view )
: m_func( functor )
, m_policy( policy )
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
if ( result_view.ptr_on_device() ) {
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result_view )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() )
{ }
};
//----------------------------------------------------------------------------
@ -208,50 +226,63 @@ class ParallelFor< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
const FunctorType m_func ;
const Policy m_team ;
const FunctorType m_functor ;
const Policy m_policy ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( member ); }
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member )
{
while ( member ) {
functor( member );
member.team_barrier();
member.next_team();
}
}
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( TagType() , member ); }
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member )
{
const TagType t{} ;
while ( member ) {
functor( t , member );
member.team_barrier();
member.next_team();
}
}
static void execute( QthreadExec & exec , const void * arg )
static void exec( QthreadExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
typename Policy::member_type member( exec , self.m_team );
while ( member ) {
self.ParallelFor::template driver< typename Policy::work_tag >( member );
member.team_barrier();
member.next_team();
}
ParallelFor::template exec_team< WorkTag >
( self.m_functor , Member( exec , self.m_policy ) );
exec.exec_all_barrier();
}
public:
ParallelFor( const FunctorType & functor ,
const Policy & policy )
: m_func( functor )
, m_team( policy )
inline
void execute() const
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ 0
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
}
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{ }
};
//----------------------------------------------------------------------------
@ -263,148 +294,170 @@ private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const Policy m_team ;
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( member , update ); }
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
{
while ( member ) {
functor( member , update );
member.team_barrier();
member.next_team();
}
}
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( TagType() , member , update ); }
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
{
const TagType t{} ;
while ( member ) {
functor( t , member , update );
member.team_barrier();
member.next_team();
}
}
static void execute( QthreadExec & exec , const void * arg )
static void exec( QthreadExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
ParallelReduce::template exec_team< WorkTag >
( self.m_functor
, Member( exec , self.m_policy )
, ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
typename Policy::member_type member( exec , self.m_team );
while ( member ) {
self.ParallelReduce::template driver< typename Policy::work_tag >( member , update );
member.team_barrier();
member.next_team();
}
exec.template exec_all_reduce< FunctorType , typename Policy::work_tag >( self.m_func );
exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor );
}
public:
template< class ViewType >
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const ViewType & result )
: m_func( functor )
, m_team( policy )
inline
void execute() const
{
QthreadExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( functor )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
( /* reduction memory */ ValueTraits::value_size( m_functor )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const ViewType & arg_result )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() )
{ }
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const FunctorType m_functor ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update , const bool final )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( i , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend
, reference_type update , const bool final )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , final );
const TagType t{} ;
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( t , i , update , final );
}
}
static void execute( QthreadExec & exec , const void * arg )
static void exec( QthreadExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
const typename Policy::WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
const WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
reference_type update = ValueInit::init( self.m_functor , exec.exec_all_reduce_value() );
driver( self.m_func , update , false , range );
ParallelScan::template exec_range< WorkTag >( self.m_functor, range.begin() , range.end() , update , false );
exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_func );
exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_functor );
driver( self.m_func , update , true , range );
ParallelScan::template exec_range< WorkTag >( self.m_functor , range.begin() , range.end() , update , true );
exec.exec_all_barrier();
}
public:
ParallelScan( const FunctorType & functor
, const Policy & policy
)
: m_func( functor )
, m_policy( policy )
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
}
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this );
ParallelScan( const FunctorType & arg_functor
, const Policy & arg_policy
)
: m_functor( arg_functor )
, m_policy( arg_policy )
{
}
};

View File

@ -255,6 +255,56 @@ void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
//----------------------------------------------------------------------------
void Task::closeout()
{
enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING ) |
int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx %s\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(this)
, ( m_state == RESPAWN ? "respawn" : "complete" )
);
fflush(stdout);
#endif
// When dependent tasks run there would be a race
// condition between destroying this task and
// querying the active count pointer from this task.
int volatile * const active_count = m_active_count ;
if ( m_state == RESPAWN ) {
// Task requests respawn, set state to waiting and reschedule the task
m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
schedule();
}
else {
// Task did not respawn, is complete
m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
// Release dependences before allowing dependent tasks to run.
// Otherwise there is a thread race condition for removing dependences.
for ( int i = 0 ; i < m_dep_size ; ++i ) {
assign( & m_dep[i] , 0 );
}
// Set qthread FEB to full so that dependent tasks are allowed to execute.
// This 'task' may be deleted immediately following this function call.
qthread_fill( & m_qfeb );
// The dependent task could now complete and destroy 'this' task
// before the call to 'qthread_fill' returns. Therefore, for
// thread safety assume that 'this' task has now been destroyed.
}
// Decrement active task count before returning.
Kokkos::atomic_decrement( active_count );
}
aligned_t Task::qthread_func( void * arg )
{
Task * const task = reinterpret_cast< Task * >(arg);
@ -291,62 +341,18 @@ fflush(stdout);
#endif
member.team_barrier();
close_out = member.team_rank() == 0 ;
if ( member.team_rank() == 0 ) task->closeout();
member.team_barrier();
}
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) {
// Team hard-wired to one, no cloning
Kokkos::Impl::QthreadTeamPolicyMember member ;
(*task->m_apply_team)( task , member );
close_out = true ;
task->closeout();
}
else {
(*task->m_apply_single)( task );
close_out = true ;
}
if ( close_out ) {
// When dependent tasks run there would be a race
// condition between destroying this task and
// querying the active count pointer from this task.
int volatile * active_count = task->m_active_count ;
if ( task->m_state == ( Kokkos::Experimental::TASK_STATE_WAITING | Kokkos::Experimental::TASK_STATE_EXECUTING ) ) {
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx respawn\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
);
fflush(stdout);
#endif
// Task respawned, set state to waiting and reschedule the task
task->m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
task->schedule();
}
else {
// Task did not respawn, is complete
task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
// Release dependences before allowing dependent tasks to run.
// Otherwise there is a thread race condition for removing dependences.
for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
assign( & task->m_dep[i] , 0 );
}
// Set qthread FEB to full so that dependent tasks are allowed to execute.
// This 'task' may be deleted immediately following this function call.
qthread_fill( & task->m_qfeb );
}
// Decrement active task count before returning.
Kokkos::atomic_decrement( active_count );
task->closeout();
}
#if 0
@ -419,8 +425,7 @@ fflush(stdout);
, NULL
, m_dep_size , qprecon /* dependences */
, spawn_shepherd
// , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
, unsigned( QTHREAD_SPAWN_LOCAL_PRIORITY )
, unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
, num_worker_per_shepherd - 1
);
}

View File

@ -121,6 +121,7 @@ private:
}
void schedule();
void closeout();
protected :
@ -490,7 +491,7 @@ public:
KOKKOS_INLINE_FUNCTION
TaskPolicy( const TaskPolicy & rhs )
: m_default_dependence_capacity( rhs.m_default_dependence_capacity )
, m_team_size( m_team_size )
, m_team_size( rhs.m_team_size )
, m_active_count_root(0)
, m_active_count( rhs.m_active_count )
{}
@ -499,7 +500,7 @@ public:
TaskPolicy( const TaskPolicy & rhs
, const unsigned arg_default_dependence_capacity )
: m_default_dependence_capacity( arg_default_dependence_capacity )
, m_team_size( m_team_size )
, m_team_size( rhs.m_team_size )
, m_active_count_root(0)
, m_active_count( rhs.m_active_count )
{}

View File

@ -1,13 +1,13 @@
/*
//@HEADER
// ************************************************************************
//
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@ -36,7 +36,7 @@
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
//
// ************************************************************************
//@HEADER
*/
@ -50,9 +50,7 @@
#include <utility>
#include <iostream>
#include <sstream>
#include <Kokkos_Threads.hpp>
#include <Kokkos_hwloc.hpp>
#include <Kokkos_Atomic.hpp>
#include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp>
@ -135,7 +133,11 @@ void ThreadsExec::driver(void)
ThreadsExec::ThreadsExec()
: m_pool_base(0)
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
, m_scratch()
#else
, m_scratch(0)
#endif
, m_scratch_reduce_end(0)
, m_scratch_thread_end(0)
, m_numa_rank(0)
@ -194,8 +196,25 @@ ThreadsExec::~ThreadsExec()
{
const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
m_pool_base = 0 ;
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_scratch ) {
Record * const r = Record::get_record( m_scratch );
m_scratch = 0 ;
Record::decrement( r );
}
#else
m_scratch.clear();
#endif
m_pool_base = 0 ;
m_scratch_reduce_end = 0 ;
m_scratch_thread_end = 0 ;
m_numa_rank = 0 ;
@ -303,6 +322,10 @@ void ThreadsExec::fence()
s_current_function = 0 ;
s_current_function_arg = 0 ;
// Make sure function and arguments are cleared before
// potentially re-activating threads with a subsequent launch.
memory_fence();
}
/** \brief Begin execution of the asynchronous functor */
@ -317,6 +340,9 @@ void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const vo
s_current_function = func ;
s_current_function_arg = arg ;
// Make sure function and arguments are written before activating threads.
memory_fence();
// Activate threads:
for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
@ -376,6 +402,9 @@ void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
s_current_function = func ;
s_current_function_arg = & s_threads_process ;
// Make sure function and arguments are written before activating threads.
memory_fence();
const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
@ -394,6 +423,9 @@ void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
s_current_function_arg = 0 ;
s_current_function = 0 ;
// Make sure function and arguments are cleared before proceeding.
memory_fence();
}
//----------------------------------------------------------------------------
@ -405,17 +437,51 @@ void * ThreadsExec::root_reduce_scratch()
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
{
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( exec.m_scratch ) {
Record * const r = Record::get_record( exec.m_scratch );
exec.m_scratch = 0 ;
Record::decrement( r );
}
#else
exec.m_scratch.clear();
#endif
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
if ( s_threads_process.m_scratch_thread_end ) {
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
// Allocate tracked memory:
{
Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
Record::increment( r );
exec.m_scratch = r->data();
}
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
#else
exec.m_scratch =
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
#endif
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
// touch on this thread
@ -452,7 +518,11 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
}
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
return s_threads_process.m_scratch ;
#else
return s_threads_process.m_scratch.alloc_ptr() ;
#endif
}
//----------------------------------------------------------------------------
@ -550,7 +620,8 @@ void ThreadsExec::initialize( unsigned thread_count ,
// then they will be given default values based upon hwloc detection
// and allowed asynchronous execution.
const bool hwloc_avail = hwloc::available();
const bool hwloc_avail = Kokkos::hwloc::available();
const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();
if ( thread_count == 0 ) {
thread_count = hwloc_avail
@ -588,7 +659,11 @@ void ThreadsExec::initialize( unsigned thread_count ,
// If hwloc available then spawned thread will
// choose its own entry in 's_threads_coord'
// otherwise specify the entry.
s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );
s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );
// Make sure all outstanding memory writes are complete
// before spawning the new thread.
memory_fence();
// Spawn thread executing the 'driver()' function.
// Wait until spawned thread has attempted to initialize.
@ -617,9 +692,13 @@ void ThreadsExec::initialize( unsigned thread_count ,
s_current_function_arg = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
memory_fence();
if ( ! thread_spawn_failed ) {
// Bind process to the core on which it was located before spawning occured
Kokkos::hwloc::bind_this_thread( proc_coord );
if (hwloc_can_bind) {
Kokkos::hwloc::bind_this_thread( proc_coord );
}
if ( thread_spawn_begin ) { // Include process in pool.
const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
@ -702,7 +781,9 @@ void ThreadsExec::finalize()
s_threads_exec[0] = 0 ;
}
Kokkos::hwloc::unbind_this_thread();
if (Kokkos::hwloc::can_bind_threads() ) {
Kokkos::hwloc::unbind_this_thread();
}
s_thread_pool_size[0] = 0 ;
s_thread_pool_size[1] = 0 ;

View File

@ -89,7 +89,11 @@ private:
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
Impl::AllocationTracker m_scratch ;
#else
void * m_scratch ;
#endif
int m_scratch_reduce_end ;
int m_scratch_thread_end ;
int m_numa_rank ;
@ -122,9 +126,19 @@ public:
static int get_thread_count();
static ThreadsExec * get_thread( const int init_thread_rank );
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
#else
inline void * reduce_memory() const { return m_scratch ; }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const
{ return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
#endif
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }

View File

@ -155,6 +155,7 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value )
#elif defined( KOKKOS_HAVE_WINTHREAD )
/* Windows libraries */
#include <winsock2.h>
#include <windows.h>
#include <process.h>

View File

@ -423,6 +423,8 @@ private:
int m_team_size ;
int m_team_alloc ;
size_t m_scratch_size;
inline
void init( const int league_size_request
, const int team_size_request )
@ -477,19 +479,68 @@ public:
inline int team_size() const { return m_team_size ; }
inline int team_alloc() const { return m_team_alloc ; }
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_scratch_size ; }
/** \brief Specify league size, request team size */
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
TeamPolicy( execution_space &
, int league_size_request
, int team_size_request
, int vector_length_request = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_scratch_size ( 0 )
{ init(league_size_request,team_size_request); (void) vector_length_request; }
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
/** \brief Specify league size, request team size */
TeamPolicy( execution_space &
, int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; }
, m_scratch_size ( 0 )
{ init(league_size_request,execution_space::thread_pool_size(2)); }
TeamPolicy( int league_size_request
, int team_size_request
, int /* vector_length_request */ = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_scratch_size ( 0 )
{ init(league_size_request,team_size_request); }
TeamPolicy( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_scratch_size ( 0 )
{ init(league_size_request,execution_space::thread_pool_size(2)); }
template<class MemorySpace>
TeamPolicy( int league_size_request
, int team_size_request
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_scratch_size(scratch_request.total(team_size_request))
{ init(league_size_request,team_size_request); }
template<class MemorySpace>
TeamPolicy( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
: m_league_size(0)
, m_team_size(0)
, m_team_alloc(0)
, m_scratch_size(scratch_request.total(execution_space::thread_pool_size(2)))
{ init(league_size_request,execution_space::thread_pool_size(2)); }
typedef Impl::ThreadsExecTeamMember member_type ;

View File

@ -45,6 +45,7 @@
#define KOKKOS_THREADS_PARALLEL_HPP
#include <vector>
#include <iostream>
#include <Kokkos_Parallel.hpp>
@ -58,363 +59,440 @@ namespace Impl {
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/* ParallelFor Kokkos::Threads with RangePolicy */
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
const FunctorType m_func ;
const FunctorType m_functor ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( i );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member ibeg , const Member iend )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i );
const TagType t{} ;
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( t , i );
}
}
static void execute( ThreadsExec & exec , const void * arg )
static void exec( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() ) );
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
ParallelFor::template exec_range< WorkTag >
( self.m_functor , range.begin() , range.end() );
exec.fan_in();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
inline
void execute() const
{
ThreadsExec::start( & ParallelFor::execute , this );
ThreadsExec::start( & ParallelFor::exec , this );
ThreadsExec::fence();
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{}
};
//----------------------------------------------------------------------------
/* ParallelFor Kokkos::Threads with TeamPolicy */
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
class ParallelFor< FunctorType
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
>
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
const FunctorType m_func ;
const FunctorType m_functor ;
const Policy m_policy ;
const int m_shared ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( member ); }
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member )
{
for ( ; member.valid() ; member.next() ) {
functor( member );
}
}
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member ) const
{ m_func( TagType() , member ); }
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member )
{
const TagType t{} ;
for ( ; member.valid() ; member.next() ) {
functor( t , member );
}
}
static void execute( ThreadsExec & exec , const void * arg )
static void exec( ThreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
for ( ; member.valid() ; member.next() ) {
self.ParallelFor::template driver< typename Policy::work_tag >( member );
}
ParallelFor::exec_team< WorkTag >
( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) );
exec.fan_in();
}
public:
ParallelFor( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
inline
void execute() const
{
ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelFor::execute , this );
ThreadsExec::start( & ParallelFor::exec , this );
ThreadsExec::fence();
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ }
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and RangePolicy */
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Threads >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef Kokkos::RangePolicy< Arg0 , Arg1, Arg2, Kokkos::Threads > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member & ibeg , const Member & iend
, reference_type update )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( i , update );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member & ibeg , const Member & iend
, reference_type update )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update );
const TagType t{} ;
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( t , i , update );
}
}
static void execute( ThreadsExec & exec , const void * arg )
static void exec( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
driver( self.m_func
, ValueInit::init( self.m_func , exec.reduce_memory() )
, typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() )
);
ParallelReduce::template exec_range< WorkTag >
( self.m_functor , range.begin() , range.end()
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
}
public:
template< class HostViewType >
ParallelReduce( const FunctorType & functor ,
const Policy & policy ,
const HostViewType & result_view )
: m_func( functor )
, m_policy( policy )
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , 0 );
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
ThreadsExec::start( & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
ThreadsExec::start( & ParallelReduce::exec , this );
ThreadsExec::fence();
if ( result_view.ptr_on_device() ) {
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
if ( m_result_ptr ) {
const pointer_type data =
(pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy ,
const HostViewType & arg_result_view )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Threads reduce result must be a View" );
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Threads reduce result must be a View in HostSpace" );
}
};
//----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and TeamPolicy */
template< class FunctorType , class Arg0 , class Arg1 >
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
>
{
private:
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const FunctorType m_functor ;
const Policy m_policy ;
const pointer_type m_result_ptr ;
const int m_shared ;
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( member , update ); }
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
{
for ( ; member.valid() ; member.next() ) {
functor( member , update );
}
}
template< class TagType >
KOKKOS_FORCEINLINE_FUNCTION
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
const typename Policy::member_type & >::type member
, reference_type update ) const
{ m_func( TagType() , member , update ); }
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
{
const TagType t{} ;
for ( ; member.valid() ; member.next() ) {
functor( t , member , update );
}
}
static void execute( ThreadsExec & exec , const void * arg )
static void exec( ThreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
// Initialize thread-local value
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
ParallelReduce::template exec_team< WorkTag >
( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
for ( ; member.valid() ; member.next() ) {
self.ParallelReduce::template driver< work_tag >( member , update );
}
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
}
public:
ParallelReduce( const FunctorType & functor
, const Policy & policy )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
inline
void execute() const
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::execute , this );
ThreadsExec::start( & ParallelReduce::exec , this );
ThreadsExec::fence();
if ( m_result_ptr ) {
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
}
}
template< class ViewType >
ParallelReduce( const FunctorType & functor
, const Policy & policy
, const ViewType & result )
: m_func( functor )
, m_policy( policy )
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
{
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::execute , this );
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
ThreadsExec::fence();
const unsigned n = ValueTraits::value_count( m_func );
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
}
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const ViewType & arg_result )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ }
};
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/* ParallelScan with Kokkos::Threads and RangePolicy */
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Threads >
>
{
private:
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
typedef typename Policy::work_tag work_tag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
typedef Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Threads > Policy ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_func ;
const FunctorType m_functor ;
const Policy m_policy ;
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member & ibeg , const Member & iend
, reference_type update , const bool final )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( i , update , final );
}
}
template< class PType >
KOKKOS_FORCEINLINE_FUNCTION static
void driver( typename Impl::enable_if<
( ! Impl::is_same< typename PType::work_tag , void >::value )
, const FunctorType & >::type functor
, reference_type update
, const bool final
, const PType & range )
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_range( const FunctorType & functor
, const Member & ibeg , const Member & iend
, reference_type update , const bool final )
{
const typename PType::member_type e = range.end();
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
functor( typename PType::work_tag() , i , update , final );
const TagType t{} ;
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
#pragma ivdep
#endif
for ( Member i = ibeg ; i < iend ; ++i ) {
functor( t , i , update , final );
}
}
static void execute( ThreadsExec & exec , const void * arg )
static void exec( ThreadsExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
reference_type update =
ValueInit::init( self.m_functor , exec.reduce_memory() );
driver( self.m_func , update , false , range );
ParallelScan::template exec_range< WorkTag >
( self.m_functor , range.begin(), range.end(), update, false );
// exec.<FunctorType,work_tag>scan_large( self.m_func );
exec.template scan_small<FunctorType,work_tag>( self.m_func );
// exec.template scan_large<FunctorType,WorkTag>( self.m_functor );
exec.template scan_small<FunctorType,WorkTag>( self.m_functor );
driver( self.m_func , update , true , range );
ParallelScan::template exec_range< WorkTag >
( self.m_functor , range.begin(), range.end(), update, true );
exec.fan_in();
}
public:
ParallelScan( const FunctorType & functor , const Policy & policy )
: m_func( functor )
, m_policy( policy )
inline
void execute() const
{
ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_func ) , 0 );
ThreadsExec::start( & ParallelScan::execute , this );
ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
ThreadsExec::start( & ParallelScan::exec , this );
ThreadsExec::fence();
}
ParallelScan( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
{ }
};
} // namespace Impl

View File

@ -0,0 +1,18 @@
SET(HEADERS "")
SET(SOURCES "")
FILE(GLOB HEADERS *.hpp)
FILE(GLOB SOURCES *.cpp)
TRIBITS_ADD_LIBRARY(
kokkoscore_impl
NOINSTALLHEADERS ${HEADERS}
SOURCES ${SOURCES}
DEPLIBS
)
SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/)

View File

@ -47,6 +47,27 @@ namespace Kokkos {
namespace Experimental {
namespace Impl {
int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
{
// A host thread claim and disable tracking flag
while ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) );
}
void SharedAllocationRecord< void , void >::tracking_release_and_enable()
{
// The host thread that claimed and disabled the tracking flag
// now release and enable tracking.
if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
}
}
//----------------------------------------------------------------------------
bool
SharedAllocationRecord< void , void >::
is_sane( SharedAllocationRecord< void , void > * arg_record )
@ -61,7 +82,7 @@ is_sane( SharedAllocationRecord< void , void > * arg_record )
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == 0 );
while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == zero );
for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) {
const bool ok_non_null = rec && rec->m_prev && ( rec == root || rec->m_next );
@ -73,14 +94,25 @@ is_sane( SharedAllocationRecord< void , void > * arg_record )
ok = ok_root && ok_prev_next && ok_next_prev && ok_count ;
if ( ! ok ) {
fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n"
, reinterpret_cast< unsigned long >( rec )
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12llx){ m_count(%d) m_root(0x%.12llx) m_next(0x%.12llx) m_prev(0x%.12llx) m_next->m_prev(0x%.12llx) m_prev->m_next(0x%.12llx) }\n";
}
fprintf(stderr
, format_string
, reinterpret_cast< uintptr_t >( rec )
, rec->m_count
, reinterpret_cast< unsigned long >( rec->m_root )
, reinterpret_cast< unsigned long >( rec->m_next )
, reinterpret_cast< unsigned long >( rec->m_prev )
, reinterpret_cast< unsigned long >( rec->m_next->m_prev )
, reinterpret_cast< unsigned long >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
, reinterpret_cast< uintptr_t >( rec->m_root )
, reinterpret_cast< uintptr_t >( rec->m_next )
, reinterpret_cast< uintptr_t >( rec->m_prev )
, reinterpret_cast< uintptr_t >( rec->m_next->m_prev )
, reinterpret_cast< uintptr_t >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
);
}
@ -102,7 +134,7 @@ SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * con
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , 0 ) ) == 0 );
while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , zero ) ) == zero );
// Iterate searching for the record with this data pointer
@ -148,7 +180,7 @@ SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root
m_prev = m_root ;
// Read root->m_next and lock by setting to zero
while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == 0 );
while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == zero );
m_next->m_prev = this ;
@ -187,7 +219,7 @@ decrement( SharedAllocationRecord< void , void > * arg_record )
SharedAllocationRecord * root_next = 0 ;
// Lock the list:
while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , 0 ) ) == 0 );
while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , zero ) ) == zero );
arg_record->m_next->m_prev = arg_record->m_prev ;
@ -232,16 +264,26 @@ print_host_accessible_records( std::ostream & s
if ( detail ) {
do {
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
snprintf( buffer , 256 , "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
}
snprintf( buffer , 256
, format_string
, space_name
, reinterpret_cast<unsigned long>( r )
, reinterpret_cast<unsigned long>( r->m_prev )
, reinterpret_cast<unsigned long>( r->m_next )
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
, reinterpret_cast<uintptr_t>( r )
, reinterpret_cast<uintptr_t>( r->m_prev )
, reinterpret_cast<uintptr_t>( r->m_next )
, reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
, r->m_alloc_size
, r->m_count
, reinterpret_cast<unsigned long>( r->m_dealloc )
, reinterpret_cast<uintptr_t>( r->m_dealloc )
, r->m_alloc_ptr->m_label
);
std::cout << buffer ;
@ -251,10 +293,20 @@ print_host_accessible_records( std::ostream & s
else {
do {
if ( r->m_alloc_ptr ) {
//Formatting dependent on sizeof(uintptr_t)
const char * format_string;
snprintf( buffer , 256 , "%s [ 0x%.12lx + %ld ] %s\n"
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
format_string = "%s [ 0x%.12lx + %ld ] %s\n";
}
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
format_string = "%s [ 0x%.12llx + %ld ] %s\n";
}
snprintf( buffer , 256
, format_string
, space_name
, reinterpret_cast< unsigned long >( r->data() )
, reinterpret_cast< uintptr_t >( r->data() )
, r->size()
, r->m_alloc_ptr->m_label
);

View File

@ -41,6 +41,9 @@
//@HEADER
*/
#ifndef KOKKOS_SHARED_ALLOC_HPP_
#define KOKKOS_SHARED_ALLOC_HPP_
namespace Kokkos {
namespace Experimental {
namespace Impl {
@ -78,6 +81,8 @@ protected:
typedef void (* function_type )( SharedAllocationRecord<void,void> * );
static int s_tracking_enabled ;
SharedAllocationHeader * const m_alloc_ptr ;
size_t const m_alloc_size ;
function_type const m_dealloc ;
@ -100,6 +105,18 @@ protected:
public:
static int tracking_enabled() { return s_tracking_enabled ; }
/**\brief A host process thread claims and disables the
* shared allocation tracking flag.
*/
static void tracking_claim_and_disable();
/**\brief A host process thread releases and enables the
* shared allocation tracking flag.
*/
static void tracking_release_and_enable();
~SharedAllocationRecord() = default ;
constexpr SharedAllocationRecord()
@ -148,6 +165,25 @@ public:
, const bool detail );
};
namespace {
/* Taking the address of this function so make sure it is unique */
template < class MemorySpace , class DestroyFunctor >
void deallocate( SharedAllocationRecord<void,void> * record_ptr )
{
typedef SharedAllocationRecord< MemorySpace , void > base_type ;
typedef SharedAllocationRecord< MemorySpace , DestroyFunctor > this_type ;
this_type * const ptr = static_cast< this_type * >(
static_cast< base_type * >( record_ptr ) );
ptr->m_destroy.destroy_shared_allocation();
delete ptr ;
}
}
/*
* Memory space specialization of SharedAllocationRecord< Space , void > requires :
*
@ -158,25 +194,23 @@ public:
* Space m_space ;
* }
*/
template< class MemorySpace , class DestroyFunctor >
class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void >
{
private:
static void deallocate( SharedAllocationRecord<void,void> * record_ptr )
{ delete static_cast<SharedAllocationRecord<MemorySpace,DestroyFunctor>*>(record_ptr); }
SharedAllocationRecord( const MemorySpace & arg_space
, const std::string & arg_label
, const size_t arg_alloc
)
/* Allocate user memory as [ SharedAllocationHeader , user_memory ] */
: SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & deallocate )
: SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & Kokkos::Experimental::Impl::deallocate< MemorySpace , DestroyFunctor > )
, m_destroy()
{}
~SharedAllocationRecord() { m_destroy.destroy_shared_allocation(); }
SharedAllocationRecord() = delete ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
public:
@ -204,42 +238,48 @@ private:
typedef SharedAllocationRecord<void,void> Record ;
enum : unsigned long {
DO_NOT_DEREF_FLAG = 0x01ul
};
enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
// The allocation record resides in Host memory space
Record * m_record ;
unsigned long m_record_bits;
KOKKOS_INLINE_FUNCTION
static Record * disable( Record * rec )
{ return reinterpret_cast<Record*>( reinterpret_cast<unsigned long>( rec ) & DO_NOT_DEREF_FLAG ); }
KOKKOS_INLINE_FUNCTION
void increment() const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
#endif
}
KOKKOS_INLINE_FUNCTION
void decrement() const
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
#endif
}
Record * m_record ;
uintptr_t m_record_bits ;
public:
KOKKOS_INLINE_FUNCTION
constexpr SharedAllocationTracker() : m_record_bits( DO_NOT_DEREF_FLAG ) {}
// Use macros instead of inline functions to reduce
// pressure on compiler optimization by reducing
// number of symbols and inline functons.
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#define KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED \
Record::tracking_enabled()
#define KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT \
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
#define KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT \
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
#else
#define KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED 0
#define KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT /* */
#define KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT /* */
#endif
/** \brief Assign a specialized record */
inline
void assign_allocated_record_to_uninitialized( Record * arg_record )
{ Record::increment( m_record = arg_record ); }
template< class MemorySpace >
constexpr
SharedAllocationRecord< MemorySpace , void > & get_record() const
SharedAllocationRecord< MemorySpace , void > &
get_record() const
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
template< class MemorySpace >
@ -252,36 +292,92 @@ public:
}
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( Record * arg_record )
: m_record( arg_record ) { increment(); }
KOKKOS_INLINE_FUNCTION
~SharedAllocationTracker() { decrement(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( const SharedAllocationTracker & rhs )
: m_record( rhs.m_record ) { increment(); }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker( SharedAllocationTracker && rhs )
: m_record( rhs.m_record ) { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
KOKKOS_INLINE_FUNCTION
SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
int use_count() const
{
decrement();
m_record = rhs.m_record ;
increment();
return *this ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
Record * const tmp = reinterpret_cast<Record*>( m_record_bits & ~DO_NOT_DEREF_FLAG );
return ( tmp ? tmp->use_count() : 0 );
#else
return 0 ;
#endif
}
KOKKOS_INLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{ KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT }
KOKKOS_FORCEINLINE_FUNCTION
constexpr SharedAllocationTracker()
: m_record_bits( DO_NOT_DEREF_FLAG ) {}
// Move:
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker( SharedAllocationTracker && rhs )
: m_record_bits( rhs.m_record_bits )
{ rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs )
{
m_record = rhs.m_record ;
// If this is tracking then must decrement
KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
// Move and reset RHS to default constructed value.
m_record_bits = rhs.m_record_bits ;
rhs.m_record_bits = DO_NOT_DEREF_FLAG ;
return *this ;
}
// Copy:
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker( const SharedAllocationTracker & rhs )
: m_record_bits( KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
? rhs.m_record_bits
: rhs.m_record_bits | DO_NOT_DEREF_FLAG )
{
KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
}
/** \brief Copy construction may disable tracking. */
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker( const SharedAllocationTracker & rhs
, const bool enable_tracking )
: m_record_bits( KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
&& enable_tracking
? rhs.m_record_bits
: rhs.m_record_bits | DO_NOT_DEREF_FLAG )
{ KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT }
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
{
// If this is tracking then must decrement
KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
m_record_bits = KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
? rhs.m_record_bits
: rhs.m_record_bits | DO_NOT_DEREF_FLAG ;
KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
return *this ;
}
/** \brief Copy assignment may disable tracking */
KOKKOS_FORCEINLINE_FUNCTION
void assign( const SharedAllocationTracker & rhs
, const bool enable_tracking )
{
KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
m_record_bits = KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
&& enable_tracking
? rhs.m_record_bits
: rhs.m_record_bits | DO_NOT_DEREF_FLAG ;
KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
}
#undef KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
#undef KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
#undef KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
};
@ -289,4 +385,4 @@ public:
} /* namespace Experimental */
} /* namespace Kokkos */
#endif

View File

@ -47,6 +47,28 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
namespace Kokkos {
/* For backward compatibility */
struct ViewAllocateWithoutInitializing {
const std::string label ;
ViewAllocateWithoutInitializing() : label() {}
ViewAllocateWithoutInitializing( const std::string & arg_label ) : label( arg_label ) {}
ViewAllocateWithoutInitializing( const char * const arg_label ) : label( arg_label ) {}
};
} /* namespace Kokkos */
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {

View File

@ -50,8 +50,8 @@ namespace Kokkos {
namespace Experimental {
namespace Impl {
template< class DataType , class V , long N , class P , class ArrayLayout >
struct ViewDataAnalysis< DataType , Kokkos::Array<V,N,P> , ArrayLayout >
template< class DataType , class ArrayLayout , class V , size_t N , class P >
struct ViewDataAnalysis< DataType , ArrayLayout , Kokkos::Array<V,N,P> >
{
private:
@ -73,15 +73,7 @@ private:
, typename array_analysis::const_value_type
>::value };
typedef ViewDimension< ( dimension::rank == 0 ? N : dimension::arg_N0 )
, ( dimension::rank == 1 ? N : dimension::arg_N1 )
, ( dimension::rank == 2 ? N : dimension::arg_N2 )
, ( dimension::rank == 3 ? N : dimension::arg_N3 )
, ( dimension::rank == 4 ? N : dimension::arg_N4 )
, ( dimension::rank == 5 ? N : dimension::arg_N5 )
, ( dimension::rank == 6 ? N : dimension::arg_N6 )
, ( dimension::rank == 7 ? N : dimension::arg_N7 )
> array_scalar_dimension ;
typedef typename dimension::template append<N>::type array_scalar_dimension ;
typedef typename std::conditional< is_const , const V , V >::type scalar_type ;
typedef V non_const_scalar_type ;
@ -113,18 +105,18 @@ namespace Impl {
/** \brief View mapping for non-specialized data type and standard layout */
template< class Traits >
class ViewMapping< Traits , void ,
typename std::enable_if<( std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
)>::type >
class ViewMapping< Traits ,
typename std::enable_if<(
std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
)>::type >
{
private:
template< class , class , typename > friend class ViewMapping ;
template< class , bool , bool , bool , bool , bool , bool , bool , bool , class > friend struct SubviewMapping ;
template< class , class , class , class > friend class Kokkos::Experimental::View ;
template< class , class ... > friend class ViewMapping ;
template< class , class ... > friend class Kokkos::Experimental::View ;
typedef ViewOffset< typename Traits::dimension
, typename Traits::array_layout
@ -187,16 +179,20 @@ public:
// Range span
/** \brief Span of the mapped range */
KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_offset.span(); }
KOKKOS_INLINE_FUNCTION constexpr size_t span() const
{ return m_offset.span() * Array_N ; }
/** \brief Is the mapped range span contiguous */
KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_offset.span_is_contiguous(); }
KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
{ return m_offset.span_is_contiguous(); }
typedef typename std::conditional< is_contiguous_reference , contiguous_reference , strided_reference >::type reference_type ;
typedef handle_type pointer_type ;
/** \brief If data references are lvalue_reference than can query pointer to memory */
KOKKOS_INLINE_FUNCTION constexpr typename Traits::value_type * data() const
{ return (typename Traits::value_type *) 0 ; }
KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const
{ return m_handle ; }
//----------------------------------------
// The View class performs all rank and bounds checking before
@ -259,14 +255,14 @@ public:
private:
enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
enum { MemorySpanSize = sizeof(typename Traits::value_type) };
enum { MemorySpanSize = sizeof(scalar_type) };
public:
/** \brief Span, in bytes, of the referenced memory */
KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const
{
return ( m_stride * sizeof(typename Traits::value_type) + MemorySpanMask ) & ~size_t(MemorySpanMask);
return ( m_offset.span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
}
/** \brief Span, in bytes, of the required memory */
@ -277,7 +273,7 @@ public:
, const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 )
{
typedef std::integral_constant< unsigned , AllowPadding ? MemorySpanSize : 0 > padding ;
return ( offset_type( padding(), N0, N1, N2, N3, N4, N5, N6, N7 ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
return ( offset_type( padding(), N0, N1, N2, N3, N4, N5, N6, N7 ).span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
}
/** \brief Span, in bytes, of the required memory */
@ -286,7 +282,7 @@ public:
static constexpr size_t memory_span( const std::integral_constant<bool,AllowPadding> &
, const typename Traits::array_layout & layout )
{
return ( offset_type( layout ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
return ( offset_type( layout ).span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
}
//----------------------------------------
@ -305,11 +301,11 @@ public:
template< bool AllowPadding >
KOKKOS_INLINE_FUNCTION
ViewMapping( void * ptr
ViewMapping( pointer_type ptr
, const std::integral_constant<bool,AllowPadding> &
, const size_t N0 , const size_t N1 , const size_t N2 , const size_t N3
, const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 )
: m_handle( reinterpret_cast< handle_type >( ptr ) )
: m_handle( ptr )
, m_offset( std::integral_constant< unsigned , AllowPadding ? sizeof(typename Traits::value_type) : 0 >()
, N0, N1, N2, N3, N4, N5, N6, N7 )
, m_stride( m_offset.span() )
@ -317,10 +313,10 @@ public:
template< bool AllowPadding >
KOKKOS_INLINE_FUNCTION
ViewMapping( void * ptr
ViewMapping( pointer_type ptr
, const std::integral_constant<bool,AllowPadding> &
, const typename Traits::array_layout & layout )
: m_handle( reinterpret_cast< handle_type >( ptr ) )
: m_handle( ptr )
, m_offset( layout )
, m_stride( m_offset.span() )
{}
@ -340,7 +336,8 @@ public:
{
typedef Kokkos::RangePolicy< ExecSpace , size_t > Policy ;
(void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_stride ) );
const Kokkos::Impl::ParallelFor< ViewMapping , Policy > closure( *this , Policy( 0 , m_stride ) );
closure.execute();
ExecSpace::fence();
}
@ -379,8 +376,8 @@ public:
enum { is_assignable = true };
typedef Kokkos::Experimental::Impl::SharedAllocationTracker TrackType ;
typedef ViewMapping< DstTraits , void , void > DstType ;
typedef ViewMapping< SrcTraits , void , void > SrcType ;
typedef ViewMapping< DstTraits , void > DstType ;
typedef ViewMapping< SrcTraits , void > SrcType ;
KOKKOS_INLINE_FUNCTION
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
@ -438,8 +435,8 @@ public:
std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value };
typedef Kokkos::Experimental::Impl::SharedAllocationTracker TrackType ;
typedef ViewMapping< DstTraits , void , void > DstType ;
typedef ViewMapping< SrcTraits , void , void > SrcType ;
typedef ViewMapping< DstTraits , void > DstType ;
typedef ViewMapping< SrcTraits , void > SrcType ;
KOKKOS_INLINE_FUNCTION
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
@ -452,6 +449,7 @@ public:
// Arguments beyond the destination rank are ignored.
if ( src.span_is_contiguous() ) { // not padded
dst.m_offset = dst_offset_type( std::integral_constant<unsigned,0>()
, ( 0 < SrcType::Rank ? src.dimension_0() : SrcTraits::value_type::size() )
, ( 1 < SrcType::Rank ? src.dimension_1() : SrcTraits::value_type::size() )
, ( 2 < SrcType::Rank ? src.dimension_2() : SrcTraits::value_type::size() )
, ( 3 < SrcType::Rank ? src.dimension_3() : SrcTraits::value_type::size() )
@ -483,34 +481,47 @@ public:
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief View mapping for non-specialized data type and standard layout */
template< class Traits , bool R0 , bool R1 , bool R2 , bool R3 , bool R4 , bool R5 , bool R6 , bool R7 >
struct SubviewMapping< Traits, R0, R1, R2, R3, R4, R5, R6, R7 ,
typename std::enable_if<(
std::is_same< typename Traits::specialize , Kokkos::Array<> >::value
&&
(
std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
)
)>::type >
template< class SrcTraits , class ... Args >
struct ViewMapping
< typename std::enable_if<(
std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
&&
(
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
)
)>::type
, SrcTraits
, Args ... >
{
private:
// Subview's rank
static_assert( SrcTraits::rank == sizeof...(Args) , "" );
enum : bool
{ R0 = is_integral_extent<0,Args...>::value
, R1 = is_integral_extent<1,Args...>::value
, R2 = is_integral_extent<2,Args...>::value
, R3 = is_integral_extent<3,Args...>::value
, R4 = is_integral_extent<4,Args...>::value
, R5 = is_integral_extent<5,Args...>::value
, R6 = is_integral_extent<6,Args...>::value
, R7 = is_integral_extent<7,Args...>::value
};
enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+ unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
// Whether right-most rank is a range.
enum { R0_rev = 0 == Traits::rank ? false : (
1 == Traits::rank ? R0 : (
2 == Traits::rank ? R1 : (
3 == Traits::rank ? R2 : (
4 == Traits::rank ? R3 : (
5 == Traits::rank ? R4 : (
6 == Traits::rank ? R5 : (
7 == Traits::rank ? R6 : R7 ))))))) };
enum { R0_rev = 0 == SrcTraits::rank ? false : (
1 == SrcTraits::rank ? R0 : (
2 == SrcTraits::rank ? R1 : (
3 == SrcTraits::rank ? R2 : (
4 == SrcTraits::rank ? R3 : (
5 == SrcTraits::rank ? R4 : (
6 == SrcTraits::rank ? R5 : (
7 == SrcTraits::rank ? R6 : R7 ))))))) };
// Subview's layout
typedef typename std::conditional<
@ -519,15 +530,15 @@ private:
||
// OutputRank 1 or 2, InputLayout Left, Interval 0
// because single stride one or second index has a stride.
( rank <= 2 && R0 && std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value )
( rank <= 2 && R0 && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value )
||
// OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
// because single stride one or second index has a stride.
( rank <= 2 && R0_rev && std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value )
), typename Traits::array_layout , Kokkos::LayoutStride
( rank <= 2 && R0_rev && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value )
), typename SrcTraits::array_layout , Kokkos::LayoutStride
>::type array_layout ;
typedef typename Traits::value_type value_type ;
typedef typename SrcTraits::value_type value_type ;
typedef typename std::conditional< rank == 0 , value_type ,
typename std::conditional< rank == 1 , value_type * ,
@ -543,66 +554,41 @@ private:
public:
typedef
Kokkos::Experimental::ViewTraits< data_type , array_layout
, typename Traits::device_type
, typename Traits::memory_traits > traits_type ;
typedef Kokkos::Experimental::ViewTraits
< data_type
, array_layout
, typename SrcTraits::device_type
, typename SrcTraits::memory_traits > traits_type ;
typedef Kokkos::Experimental::View< data_type
, array_layout
, typename Traits::device_type
, typename Traits::memory_traits > type ;
typedef Kokkos::Experimental::View
< data_type
, array_layout
, typename SrcTraits::device_type
, typename SrcTraits::memory_traits > type ;
template< class T0 , class T1 , class T2 , class T3
, class T4 , class T5 , class T6 , class T7 >
KOKKOS_INLINE_FUNCTION
static void assign( ViewMapping< traits_type , void , void > & dst
, ViewMapping< Traits , void , void > const & src
, T0 const & arg0
, T1 const & arg1
, T2 const & arg2
, T3 const & arg3
, T4 const & arg4
, T5 const & arg5
, T6 const & arg6
, T7 const & arg7
)
static void assign( ViewMapping< traits_type , void > & dst
, ViewMapping< SrcTraits , void > const & src
, Args ... args )
{
typedef ViewMapping< traits_type , void , void > DstType ;
typedef ViewMapping< traits_type , void > DstType ;
typedef typename DstType::offset_type dst_offset_type ;
typedef typename DstType::handle_type dst_handle_type ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T0> V0 ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T1> V1 ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T2> V2 ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T3> V3 ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T4> V4 ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T5> V5 ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T6> V6 ;
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T7> V7 ;
dst.m_offset = dst_offset_type
( src.m_offset
, V0::dimension( src.m_offset.dimension_0() , arg0 )
, V1::dimension( src.m_offset.dimension_1() , arg1 )
, V2::dimension( src.m_offset.dimension_2() , arg2 )
, V3::dimension( src.m_offset.dimension_3() , arg3 )
, V4::dimension( src.m_offset.dimension_4() , arg4 )
, V5::dimension( src.m_offset.dimension_5() , arg5 )
, V6::dimension( src.m_offset.dimension_6() , arg6 )
, V7::dimension( src.m_offset.dimension_7() , arg7 )
);
const SubviewExtents< SrcTraits::rank , rank >
extents( src.m_offset.m_dim , args... );
dst.m_offset = dst_offset_type( src.m_offset , extents );
dst.m_handle = dst_handle_type( src.m_handle +
src.m_offset( V0::begin( arg0 )
, V1::begin( arg1 )
, V2::begin( arg2 )
, V3::begin( arg3 )
, V4::begin( arg4 )
, V5::begin( arg5 )
, V6::begin( arg6 )
, V7::begin( arg7 )
src.m_offset( extents.domain_offset(0)
, extents.domain_offset(1)
, extents.domain_offset(2)
, extents.domain_offset(3)
, extents.domain_offset(4)
, extents.domain_offset(5)
, extents.domain_offset(6)
, extents.domain_offset(7)
) );
}
};

File diff suppressed because it is too large Load Diff

View File

@ -69,8 +69,8 @@ struct ViewOffset< Dimension , Layout ,
{
public:
enum { SHIFT_0 = Kokkos::Impl::power_of_two<Layout::N0>::value };
enum { SHIFT_1 = Kokkos::Impl::power_of_two<Layout::N1>::value };
enum { SHIFT_0 = Kokkos::Impl::integral_power_of_two(Layout::N0) };
enum { SHIFT_1 = Kokkos::Impl::integral_power_of_two(Layout::N1) };
enum { SHIFT_T = SHIFT_0 + SHIFT_1 };
enum { MASK_0 = Layout::N0 - 1 };
enum { MASK_1 = Layout::N1 - 1 };
@ -155,6 +155,42 @@ public:
{}
};
template< typename T , unsigned N0 , unsigned N1 , class ... P
, typename iType0 , typename iType1
>
struct ViewMapping
< void
, Kokkos::Experimental::ViewTraits<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...>
, Kokkos::LayoutTileLeft<N0,N1,true>
, iType0
, iType1 >
{
typedef Kokkos::LayoutTileLeft<N0,N1,true> src_layout ;
typedef Kokkos::Experimental::ViewTraits< T** , src_layout , P... > src_traits ;
typedef Kokkos::Experimental::ViewTraits< T[N0][N1] , LayoutLeft , P ... > traits ;
typedef Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P ... > type ;
KOKKOS_INLINE_FUNCTION static
void assign( ViewMapping< traits , void > & dst
, const ViewMapping< src_traits , void > & src
, const src_layout &
, const size_t i_tile0
, const size_t i_tile1
)
{
typedef ViewMapping< traits , void > dst_map_type ;
typedef ViewMapping< src_traits , void > src_map_type ;
typedef typename dst_map_type::handle_type dst_handle_type ;
typedef typename dst_map_type::offset_type dst_offset_type ;
typedef typename src_map_type::offset_type src_offset_type ;
dst = dst_map_type(
dst_handle_type( src.m_handle +
( ( i_tile0 + src.m_offset.m_tile_N0 * i_tile1 ) << src_offset_type::SHIFT_T ) ) ,
dst_offset_type() );
}
};
} /* namespace Impl */
} /* namespace Experimental */
} /* namespace Kokkos */
@ -162,51 +198,20 @@ public:
namespace Kokkos {
namespace Experimental {
// Using View with an invalid data type to construct the tiling subview.
// View is a friend of View so we use this invalid data type partial specialization
// to access implementation of both source and destination view for constructing
// the tile subview.
template< unsigned N0 , unsigned N1 >
struct View< void , Kokkos::LayoutTileLeft<N0,N1,true> , void , void >
{
typedef Kokkos::LayoutTileLeft<N0,N1,true> Layout ;
template< typename T , class A2 , class A3 >
KOKKOS_INLINE_FUNCTION static
Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , A2 , A3 >
tile_subview( const Kokkos::Experimental::View<T**,Layout,A2,A3> & src
, const size_t i_tile0
, const size_t i_tile1
)
{
typedef Kokkos::Experimental::View<T**,Layout,A2,A3> SrcView ;
typedef Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , A2 , A3 > DstView ;
typedef typename SrcView::map_type::offset_type src_offset_type ;
typedef typename DstView::map_type dst_map_type ;
typedef typename DstView::map_type::handle_type dst_handle_type ;
typedef typename DstView::map_type::offset_type dst_offset_type ;
return DstView( src.m_track ,
dst_map_type(
dst_handle_type( src.m_map.m_handle +
( ( i_tile0 + src.m_map.m_offset.m_tile_N0 * i_tile1 ) << src_offset_type::SHIFT_T ) ) ,
dst_offset_type() )
);
}
};
template< typename T , unsigned N0 , unsigned N1 , class A2 , class A3 >
template< typename T , unsigned N0 , unsigned N1 , class ... P >
KOKKOS_INLINE_FUNCTION
Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , A2 , A3 >
tile_subview( const Kokkos::Experimental::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,A2,A3> & src
Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P... >
tile_subview( const Kokkos::Experimental::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> & src
, const size_t i_tile0
, const size_t i_tile1
)
{
return View< void , Kokkos::LayoutTileLeft<N0,N1,true> , void , void >::
tile_subview( src , i_tile0 , i_tile1 );
// Force the specialized ViewMapping for extracting a tile
// by using the first subview argument as the layout.
typedef Kokkos::LayoutTileLeft<N0,N1,true> SrcLayout ;
return Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P... >
( src , SrcLayout() , i_tile0 , i_tile1 );
}
} /* namespace Experimental */

View File

@ -43,6 +43,8 @@
#include <Kokkos_Core_fwd.hpp>
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
@ -842,3 +844,5 @@ void * create_singleton( size_t size
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */

View File

@ -46,6 +46,8 @@
#include <Kokkos_Macros.hpp>
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
@ -351,7 +353,6 @@ public:
//-----------------------------------------------------------------------------
// forward declaration for friend classes
struct CopyWithoutTracking;
struct MallocHelper;
/// class AllocationTracker
@ -544,6 +545,10 @@ public:
/// NOT thread-safe
void reallocate( size_t size ) const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
private:
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
@ -556,31 +561,14 @@ private:
void increment_ref_count() const;
void decrement_ref_count() const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
friend struct Impl::CopyWithoutTracking;
friend struct Impl::MallocHelper;
uintptr_t m_alloc_rec;
};
/// Make a copy of the functor with reference counting disabled
struct CopyWithoutTracking
{
template <typename Functor>
static Functor apply( const Functor & f )
{
AllocationTracker::disable_tracking();
Functor func(f);
AllocationTracker::enable_tracking();
return func;
}
};
}} // namespace Kokkos::Impl
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
#endif //KOKKOS_ALLOCATION_TRACKER_HPP

View File

@ -427,6 +427,8 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
typedef int64_t type;
};
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
template<class ViewTraits>
class ViewDataHandle<
@ -457,6 +459,8 @@ public:
}
};
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
}} // namespace Kokkos::Impl
#endif

View File

@ -45,6 +45,7 @@
#ifdef _WIN32
#define NOMINMAX
#include <winsock2.h>
#include <Windows.h>
namespace Kokkos {
@ -61,7 +62,6 @@ namespace Kokkos {
};
}
#ifdef KOKKOS_HAVE_CXX11
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange(volatile T * const dest, const T & compare,
@ -103,10 +103,18 @@ namespace Kokkos {
KOKKOS_INLINE_FUNCTION U() {};
} tmp, newval;
newval.t = val;
tmp.i = _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, *((LONGLONG*)&compare));
_InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, ((LONGLONG*)&compare));
tmp.t = dest;
return tmp.t;
}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_compare_exchange_strong(volatile T * const dest, const T & compare, const T & val)
{
return atomic_compare_exchange(dest,compare,val);
}
template< typename T >
T atomic_fetch_or(volatile T * const dest, const T val) {
T oldval = *dest;
@ -147,7 +155,20 @@ namespace Kokkos {
}
template< typename T >
T atomic_fetch_exchange(volatile T * const dest, const T val) {
T atomic_fetch_sub(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
assume = oldval;
T newval = val - oldval;
oldval = atomic_compare_exchange(dest, assume, newval);
} while (assume != oldval);
return oldval;
}
template< typename T >
T atomic_exchange(volatile T * const dest, const T val) {
T oldval = *dest;
T assume;
do {
@ -174,8 +195,8 @@ namespace Kokkos {
}
template< typename T >
void atomic_exchange(volatile T * const dest, const T val) {
atomic_fetch_exchange(dest, val);
void atomic_sub(volatile T * const dest, const T val) {
atomic_fetch_sub(dest, val);
}
template< typename T >
@ -208,4 +229,4 @@ namespace Kokkos {
}
#endif
#endif
#endif

View File

@ -43,6 +43,8 @@
#include <Kokkos_HostSpace.hpp>
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
@ -50,8 +52,11 @@
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#endif
#include <sstream>
@ -103,8 +108,7 @@ void * raw_aligned_allocate( size_t size, size_t alignment )
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
ptr = _mm_malloc( size , alignment );
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
posix_memalign( & ptr, alignment , size );
@ -136,8 +140,7 @@ void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
_mm_free( ptr );
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
free( ptr );
#else
// get the alloc'd pointer
@ -279,3 +282,6 @@ void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t
}
}} // namespace Kokkos::Impl
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */

View File

@ -44,6 +44,7 @@
#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
#define KOKKOS_BASIC_ALLOCATORS_HPP
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
namespace Kokkos { namespace Impl {
@ -113,6 +114,8 @@ public:
}} // namespace Kokkos::Impl
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
#endif //KOKKOS_BASIC_ALLOCATORS_HPP

View File

@ -67,6 +67,13 @@ bool is_unsigned_int(const char* str)
void initialize_internal(const InitArguments& args)
{
// This is an experimental setting
// For KNL in Flat mode this variable should be set, so that
// memkind allocates high bandwidth memory correctly.
#ifdef KOKKOS_HAVE_HBWSPACE
setenv("MEMKIND_HBW_NODES", "1", 0);
#endif
// Protect declarations, to prevent "unused variable" warnings.
#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
const int num_threads = args.num_threads;

View File

@ -61,7 +61,7 @@ void host_abort( const char * const message )
{
fwrite(message,1,strlen(message),stderr);
fflush(stderr);
abort();
::abort();
}
void throw_runtime_exception( const std::string & msg )

View File

@ -46,6 +46,10 @@
#include <string>
#include <iosfwd>
#include <KokkosCore_config.h>
#ifdef KOKKOS_HAVE_CUDA
#include <Cuda/Kokkos_Cuda_abort.hpp>
#endif
namespace Kokkos {
namespace Impl {

View File

@ -0,0 +1,108 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#include <impl/Kokkos_HBWAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#endif
#include <sstream>
#include <iostream>
#ifdef KOKKOS_HAVE_HBWSPACE
#include <memkind.h>
namespace Kokkos {
namespace Experimental {
namespace Impl {
#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
/*--------------------------------------------------------------------------*/
/// \brief Allocate \c size bytes of high-bandwidth memory via memkind.
///
/// Returns NULL for a zero-byte request (matching malloc-style semantics
/// elsewhere in Kokkos).  On failure, throws a std::runtime_error through
/// Kokkos::Impl::throw_runtime_exception with the allocator name and the
/// requested size.
///
/// Fix: removed the unconditional debug std::cout print that ran on every
/// allocation — a low-level allocator must not write to stdout.
void* HBWMallocAllocator::allocate( size_t size )
{
  void * ptr = NULL;
  if (size) {
    ptr = memkind_malloc(MEMKIND_TYPE,size);
    if (!ptr)
    {
      std::ostringstream msg ;
      msg << name() << ": allocate(" << size << ") FAILED";
      Kokkos::Impl::throw_runtime_exception( msg.str() );
    }
  }
  return ptr;
}
/// \brief Return memory obtained from allocate() back to memkind.
///
/// The size argument is unused: memkind_free does not need it.
/// Passing NULL is a harmless no-op.
void HBWMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
  if ( ptr != NULL ) {
    memkind_free( MEMKIND_TYPE , ptr );
  }
}
void * HBWMallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
void * ptr = memkind_realloc(MEMKIND_TYPE, old_ptr, new_size);
if (new_size > 0u && ptr == NULL) {
Kokkos::Impl::throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
}
return ptr;
}
} // namespace Impl
} // namespace Experimental
} // namespace Kokkos
#endif

View File

@ -0,0 +1,75 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_HBW_ALLOCATORS_HPP
#define KOKKOS_HBW_ALLOCATORS_HPP
#ifdef KOKKOS_HAVE_HBWSPACE
namespace Kokkos {
namespace Experimental {
namespace Impl {
/// \class HBWMallocAllocator
/// \brief Stateless allocator front-end for high-bandwidth memory.
///
/// All members are static; the class is only compiled when
/// KOKKOS_HAVE_HBWSPACE is defined (see the surrounding guard).
/// The definitions live in impl/Kokkos_HBWAllocators.cpp and are backed by
/// the memkind library — NOTE(review): presumably targeting MCDRAM-style
/// high-bandwidth memory; confirm against the MEMKIND kind used in the .cpp.
class HBWMallocAllocator
{
public:
// Human-readable identifier used in error messages.
static const char * name()
{
return "HBW Malloc Allocator";
}
// Allocate \c size bytes; returns NULL for size == 0, throws on failure.
static void* allocate(size_t size);
// Release memory from allocate(); the size argument is ignored.
static void deallocate(void * ptr, size_t size);
// Resize an allocation, preserving contents; throws on failure.
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}
}
} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_HBWSPACE
#endif //KOKKOS_HBW_ALLOCATORS_HPP

Some files were not shown because too many files have changed in this diff Show More