Snapshot of kokkos.git from commit 0a776f65e7429b875839719c4fe528c15e871e46
From repository at git@github.com:/kokkos/kokkos.git
At commit:
commit 0a776f65e7429b875839719c4fe528c15e871e46
Author: crtrott <crtrott@sandia.gov>
Date: Thu Dec 10 11:51:50 2015 -0700
Adding CUDA 7.5 as secondary compiler to README
This commit is contained in:
8
lib/kokkos/.gitignore
vendored
Normal file
8
lib/kokkos/.gitignore
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
# Standard ignores
|
||||
*~
|
||||
*.pyc
|
||||
\#*#
|
||||
.#*
|
||||
.*.swp
|
||||
.cproject
|
||||
.project
|
||||
123
lib/kokkos/CMakeLists.txt
Normal file
123
lib/kokkos/CMakeLists.txt
Normal file
@ -0,0 +1,123 @@
|
||||
|
||||
#
|
||||
# A) Forward declare the package so that certain options are also defined for
|
||||
# subpackages
|
||||
#
|
||||
|
||||
TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
#
|
||||
# B) Define the common options for Kokkos first so they can be used by
|
||||
# subpackages as well.
|
||||
#
|
||||
|
||||
TRIBITS_ADD_DEBUG_OPTION()
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_SIERRA_BUILD
|
||||
KOKKOS_FOR_SIERRA
|
||||
"Configure Kokkos for building within the Sierra build system."
|
||||
OFF
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Cuda
|
||||
KOKKOS_HAVE_CUDA
|
||||
"Enable CUDA support in Kokkos."
|
||||
"${TPL_ENABLE_CUDA}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Cuda_UVM
|
||||
KOKKOS_USE_CUDA_UVM
|
||||
"Enable CUDA Unified Virtual Memory support in Kokkos."
|
||||
OFF
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Pthread
|
||||
KOKKOS_HAVE_PTHREAD
|
||||
"Enable Pthread support in Kokkos."
|
||||
"${TPL_ENABLE_Pthread}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_OpenMP
|
||||
KOKKOS_HAVE_OPENMP
|
||||
"Enable OpenMP support in Kokkos."
|
||||
"${${PROJECT_NAME}_ENABLE_OpenMP}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_QTHREAD
|
||||
KOKKOS_HAVE_QTHREAD
|
||||
"Enable QTHREAD support in Kokkos."
|
||||
"${TPL_ENABLE_QTHREAD}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_CXX11
|
||||
KOKKOS_HAVE_CXX11
|
||||
"Enable C++11 support in Kokkos."
|
||||
"${${PROJECT_NAME}_ENABLE_CXX11}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_HWLOC
|
||||
KOKKOS_HAVE_HWLOC
|
||||
"Enable HWLOC support in Kokkos."
|
||||
"${TPL_ENABLE_HWLOC}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_MPI
|
||||
KOKKOS_HAVE_MPI
|
||||
"Enable MPI support in Kokkos."
|
||||
"${TPL_ENABLE_MPI}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Debug_Bounds_Check
|
||||
KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
"Enable bounds checking support in Kokkos."
|
||||
OFF
|
||||
)
|
||||
|
||||
#TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
# Kokkos_ENABLE_Profiling_Collect_Kernel_Data
|
||||
# KOKKOS_ENABLE_PROFILING_COLLECT_KERNEL_DATA
|
||||
# "Enable profiling support for kernel data collections in Kokkos."
|
||||
# "${${PROJECT_NAME}_ENABLE_KokkosProfiler}"
|
||||
# )
|
||||
|
||||
# placeholder for future device...
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_Winthread
|
||||
KOKKOS_HAVE_WINTHREAD
|
||||
"Enable Winthread support in Kokkos."
|
||||
"${TPL_ENABLE_Winthread}"
|
||||
)
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
#
|
||||
# C) Process the subpackages for Kokkos
|
||||
#
|
||||
|
||||
TRIBITS_PROCESS_SUBPACKAGES()
|
||||
|
||||
#
|
||||
# D) If Kokkos itself is enabled, process the Kokkos package
|
||||
#
|
||||
|
||||
TRIBITS_PACKAGE_DEF()
|
||||
|
||||
TRIBITS_EXCLUDE_AUTOTOOLS_FILES()
|
||||
|
||||
TRIBITS_EXCLUDE_FILES(
|
||||
classic/doc
|
||||
classic/LinAlg/doc/CrsRefactorNotesMay2012
|
||||
)
|
||||
|
||||
TRIBITS_PACKAGE_POSTPROCESS()
|
||||
|
||||
73
lib/kokkos/HOW_TO_SNAPSHOT
Normal file
73
lib/kokkos/HOW_TO_SNAPSHOT
Normal file
@ -0,0 +1,73 @@
|
||||
|
||||
Developers of Kokkos (those who commit modifications to Kokkos)
|
||||
must maintain the snapshot of Kokkos in the Trilinos repository.
|
||||
|
||||
This file contains instructions for how to
|
||||
snapshot Kokkos from github.com/kokkos to Trilinos.
|
||||
|
||||
------------------------------------------------------------------------
|
||||
*** EVERYTHING GOES RIGHT WORKFLOW ***
|
||||
|
||||
1) Given a 'git clone' of Kokkos and of Trilinos repositories.
|
||||
1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone.
|
||||
This path *must* terminate with the directory name 'kokkos';
|
||||
e.g., ${HOME}/kokkos .
|
||||
1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory.
|
||||
|
||||
2) Given that the Kokkos build & test is clean and
|
||||
changes are committed to the Kokkos clone.
|
||||
|
||||
3) Snapshot the current commit in the Kokkos clone into the Trilinos clone.
|
||||
This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}:
|
||||
${KOKKOS}/config/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages
|
||||
|
||||
4) Verify the snapshot commit happened as expected
|
||||
cd ${TRILINOS}/packages/kokkos
|
||||
git log -1 --name-only
|
||||
|
||||
5) Modify, build, and test Trilinos with the Kokkos snapshot.
|
||||
|
||||
6) Given that the Trilinos build & test is clean and
|
||||
changes are committed to the Trilinos clone.
|
||||
|
||||
7) Attempt push to the Kokkos repository.
|
||||
If push fails then you must 'remove the Kokkos snapshot'
|
||||
from your Trilinos clone.
|
||||
See below.
|
||||
|
||||
8) Attempt to push to the Trilinos repository.
|
||||
If updating for a failed push requires you to change Kokkos you must
|
||||
'remove the Kokkos snapshot' from your Trilinos clone.
|
||||
See below.
|
||||
|
||||
------------------------------------------------------------------------
|
||||
*** WHEN SOMETHING GOES WRONG AND YOU MUST ***
|
||||
*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE ***
|
||||
|
||||
1) Query the Trilinos clone commit log.
|
||||
git log --oneline
|
||||
|
||||
2) Note the <SHA1> of the commit to the Trilinos clone
|
||||
immediately BEFORE the Kokkos snapshot commit.
|
||||
Copy this <SHA1> for use in the next command.
|
||||
|
||||
3) IF more than one outstanding commit then you can remove just the
|
||||
Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file.
|
||||
Remove or comment out the Kokkos snapshot commit entry.
|
||||
git rebase -i <SHA1>
|
||||
|
||||
4) IF the Kokkos snapshot commit is the one and only
|
||||
outstanding commit then remove just that commit.
|
||||
git reset --hard HEAD~1
|
||||
|
||||
------------------------------------------------------------------------
|
||||
*** REGARDING 'snapshot.py' TOOL ***
|
||||
|
||||
The 'snapshot.py' tool is developed and maintained by the
|
||||
Center for Computing Research (CCR)
|
||||
Software Engineering, Maintenance, and Support (SEMS) team.
|
||||
|
||||
Contact Brent Perschbacher <bmpersc@sandia.gov> for questions.
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
@ -1,20 +1,18 @@
|
||||
# Default settings common options
|
||||
|
||||
KOKKOS_PATH=../../lib/kokkos
|
||||
|
||||
#Options: OpenMP,Serial,Pthreads,Cuda
|
||||
KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Pthreads"
|
||||
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8
|
||||
#KOKKOS_DEVICES ?= "OpenMP"
|
||||
KOKKOS_DEVICES ?= "Pthreads"
|
||||
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
|
||||
KOKKOS_ARCH ?= ""
|
||||
#Options: yes,no
|
||||
KOKKOS_DEBUG ?= "no"
|
||||
#Options: hwloc,librt
|
||||
#Options: hwloc,librt,experimental_memkind
|
||||
KOKKOS_USE_TPLS ?= ""
|
||||
#Options: c++11
|
||||
KOKKOS_CXX_STANDARD ?= "c++11"
|
||||
#Options: kernel_times,aggregate_mpi
|
||||
KOKKOS_PROFILING ?= ""
|
||||
#Options: aggressive_vectorization
|
||||
KOKKOS_OPTIONS ?= ""
|
||||
|
||||
#Default settings specific options
|
||||
#Options: force_uvm,use_ldg,rdc,enable_lambda
|
||||
@ -30,8 +28,10 @@ KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | gr
|
||||
# Check for external libraries
|
||||
KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
|
||||
KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
|
||||
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
|
||||
|
||||
# Check for advanced settings
|
||||
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
|
||||
KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
|
||||
KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "force_uvm" | wc -l))
|
||||
KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
|
||||
@ -50,10 +50,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
|
||||
endif
|
||||
endif
|
||||
|
||||
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_CRAY := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)
|
||||
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname | grep CYGWIN | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_CRAY := $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)
|
||||
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname | grep CYGWIN | wc -l)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
|
||||
@ -93,8 +94,10 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
|
||||
|
||||
#NVIDIA based
|
||||
NVCC_WRAPPER := $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l))
|
||||
@ -135,8 +138,9 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
|
||||
|
||||
#Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
|
||||
#Incompatible flags?
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc ))
|
||||
@ -225,6 +229,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
|
||||
KOKKOS_LIBS += -lrt
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
|
||||
KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
|
||||
KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
|
||||
KOKKOS_LIBS += -lmemkind
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
tmp := $(shell echo "/* Optimization Settings */" >> KokkosCore_config.tmp)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
|
||||
@ -265,8 +282,41 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
|
||||
KOKKOS_CXXFLAGS += -march=core-avx2
|
||||
KOKKOS_LDFLAGS += -march=core-avx2
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xCORE-AVX2
|
||||
KOKKOS_LDFLAGS += -xCORE-AVX2
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Assume that this really is a GNU compiler
|
||||
KOKKOS_CXXFLAGS += -march=core-avx2
|
||||
KOKKOS_LDFLAGS += -march=core-avx2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xMIC-AVX512
|
||||
KOKKOS_LDFLAGS += -xMIC-AVX512
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Assume that this is really a GNU compiler
|
||||
KOKKOS_CXXFLAGS += -march=knl
|
||||
KOKKOS_LDFLAGS += -march=knl
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
|
||||
|
||||
@ -55,3 +55,8 @@ Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
|
||||
endif
|
||||
|
||||
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
|
||||
Kokkos_HBWAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWAllocators.cpp
|
||||
|
||||
|
||||
@ -20,6 +20,13 @@ GTC 2015:
|
||||
A programming guide can be found under doc/Kokkos_PG.pdf. This is an initial version
|
||||
and feedback is greatly appreciated.
|
||||
|
||||
A separate repository with extensive tutorial material can be found under
|
||||
https://github.com/kokkos/kokkos-tutorials.
|
||||
|
||||
If you have a patch to contribute please feel free to issue a pull request against
|
||||
the develop branch. For major contributions it is better to contact us first
|
||||
for guidance.
|
||||
|
||||
For questions please send an email to
|
||||
kokkos-users@software.sandia.gov
|
||||
|
||||
@ -43,6 +50,7 @@ Primary tested compilers are:
|
||||
Secondary tested compilers are:
|
||||
CUDA 6.5 (with gcc 4.7.2)
|
||||
CUDA 7.0 (with gcc 4.7.2)
|
||||
CUDA 7.5 (with gcc 4.7.2)
|
||||
|
||||
Other compilers working:
|
||||
PGI 15.4
|
||||
|
||||
10
lib/kokkos/algorithms/CMakeLists.txt
Normal file
10
lib/kokkos/algorithms/CMakeLists.txt
Normal file
@ -0,0 +1,10 @@
|
||||
|
||||
|
||||
TRIBITS_SUBPACKAGE(Algorithms)
|
||||
|
||||
ADD_SUBDIRECTORY(src)
|
||||
|
||||
TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
|
||||
#TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
|
||||
|
||||
TRIBITS_SUBPACKAGE_POSTPROCESS()
|
||||
5
lib/kokkos/algorithms/cmake/Dependencies.cmake
Normal file
5
lib/kokkos/algorithms/cmake/Dependencies.cmake
Normal file
@ -0,0 +1,5 @@
|
||||
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
LIB_REQUIRED_PACKAGES KokkosCore
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
|
||||
TEST_OPTIONAL_TPLS CUSPARSE
|
||||
)
|
||||
4
lib/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
Normal file
4
lib/kokkos/algorithms/cmake/KokkosAlgorithms_config.h.in
Normal file
@ -0,0 +1,4 @@
|
||||
#ifndef KOKKOS_ALGORITHMS_CONFIG_H
|
||||
#define KOKKOS_ALGORITHMS_CONFIG_H
|
||||
|
||||
#endif
|
||||
21
lib/kokkos/algorithms/src/CMakeLists.txt
Normal file
21
lib/kokkos/algorithms/src/CMakeLists.txt
Normal file
@ -0,0 +1,21 @@
|
||||
|
||||
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
FILE(GLOB HEADERS *.hpp)
|
||||
FILE(GLOB SOURCES *.cpp)
|
||||
LIST(APPEND HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
TRIBITS_ADD_LIBRARY(
|
||||
kokkosalgorithms
|
||||
HEADERS ${HEADERS}
|
||||
SOURCES ${SOURCES}
|
||||
DEPLIBS
|
||||
)
|
||||
|
||||
@ -45,7 +45,7 @@
|
||||
#define KOKKOS_RANDOM_HPP
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
//#include <Kokkos_Complex.hpp>
|
||||
#include <Kokkos_Complex.hpp>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cmath>
|
||||
@ -475,6 +475,58 @@ namespace Kokkos {
|
||||
|
||||
};
|
||||
|
||||
// Specialization of rand for ::Kokkos::complex<float>.
// The real and imaginary parts are obtained from two separate calls to the
// generator's frand() overloads, real part first.
template<class Generator>
|
||||
struct rand<Generator, ::Kokkos::complex<float> > {
|
||||
// Returns the complex value (1.0, 1.0).
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> max () {
|
||||
return ::Kokkos::complex<float> (1.0, 1.0);
|
||||
}
|
||||
// Builds a complex value from two successive gen.frand() calls (no bounds passed).
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> draw (Generator& gen) {
|
||||
const float re = gen.frand ();
|
||||
const float im = gen.frand ();
|
||||
return ::Kokkos::complex<float> (re, im);
|
||||
}
|
||||
// Per-component draw: real(range) is passed to frand for the real part,
// imag(range) for the imaginary part.
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& range) {
|
||||
const float re = gen.frand (real (range));
|
||||
const float im = gen.frand (imag (range));
|
||||
return ::Kokkos::complex<float> (re, im);
|
||||
}
|
||||
// Per-component draw with two bounds: the real parts of start/end are passed
// to frand for the real part, the imaginary parts for the imaginary part.
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<float> draw (Generator& gen, const ::Kokkos::complex<float>& start, const ::Kokkos::complex<float>& end) {
|
||||
const float re = gen.frand (real (start), real (end));
|
||||
const float im = gen.frand (imag (start), imag (end));
|
||||
return ::Kokkos::complex<float> (re, im);
|
||||
}
|
||||
};
|
||||
|
||||
// Specialization of rand for ::Kokkos::complex<double>.
// The real and imaginary parts are obtained from two separate calls to the
// generator's drand() overloads, real part first.
template<class Generator>
|
||||
struct rand<Generator, ::Kokkos::complex<double> > {
|
||||
// Returns the complex value (1.0, 1.0).
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> max () {
|
||||
return ::Kokkos::complex<double> (1.0, 1.0);
|
||||
}
|
||||
// Builds a complex value from two successive gen.drand() calls (no bounds passed).
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> draw (Generator& gen) {
|
||||
const double re = gen.drand ();
|
||||
const double im = gen.drand ();
|
||||
return ::Kokkos::complex<double> (re, im);
|
||||
}
|
||||
// Per-component draw: real(range) is passed to drand for the real part,
// imag(range) for the imaginary part.
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& range) {
|
||||
const double re = gen.drand (real (range));
|
||||
const double im = gen.drand (imag (range));
|
||||
return ::Kokkos::complex<double> (re, im);
|
||||
}
|
||||
// Per-component draw with two bounds: the real parts of start/end are passed
// to drand for the real part, the imaginary parts for the imaginary part.
KOKKOS_INLINE_FUNCTION
|
||||
static ::Kokkos::complex<double> draw (Generator& gen, const ::Kokkos::complex<double>& start, const ::Kokkos::complex<double>& end) {
|
||||
const double re = gen.drand (real (start), real (end));
|
||||
const double im = gen.drand (imag (start), imag (end));
|
||||
return ::Kokkos::complex<double> (re, im);
|
||||
}
|
||||
};
|
||||
|
||||
template<class DeviceType>
|
||||
class Random_XorShift64_Pool;
|
||||
|
||||
|
||||
38
lib/kokkos/algorithms/unit_tests/CMakeLists.txt
Normal file
38
lib/kokkos/algorithms/unit_tests/CMakeLists.txt
Normal file
@ -0,0 +1,38 @@
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
|
||||
|
||||
SET(SOURCES
|
||||
UnitTestMain.cpp
|
||||
TestCuda.cpp
|
||||
)
|
||||
|
||||
SET(LIBRARIES kokkoscore)
|
||||
|
||||
IF(Kokkos_ENABLE_OpenMP)
|
||||
LIST( APPEND SOURCES
|
||||
TestOpenMP.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_Serial)
|
||||
LIST( APPEND SOURCES
|
||||
TestSerial.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_Pthread)
|
||||
LIST( APPEND SOURCES
|
||||
TestThreads.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
UnitTest
|
||||
SOURCES ${SOURCES}
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
@ -6,12 +6,12 @@ vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests
|
||||
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = nvcc_wrapper
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
@ -56,7 +56,7 @@ KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
|
||||
|
||||
KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Threads
|
||||
|
||||
|
||||
KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_OpenMP
|
||||
|
||||
@ -74,11 +74,11 @@ test-openmp: KokkosAlgorithms_UnitTest_OpenMP
|
||||
|
||||
test-serial: KokkosAlgorithms_UnitTest_Serial
|
||||
./KokkosAlgorithms_UnitTest_Serial
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
||||
|
||||
clean: kokkos-clean
|
||||
rm -f *.o $(TARGETS)
|
||||
|
||||
|
||||
10
lib/kokkos/cmake/Dependencies.cmake
Normal file
10
lib/kokkos/cmake/Dependencies.cmake
Normal file
@ -0,0 +1,10 @@
|
||||
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
|
||||
#SubPackageName Directory Class Req/Opt
|
||||
#
|
||||
# New Kokkos subpackages:
|
||||
Core core PS REQUIRED
|
||||
Containers containers PS OPTIONAL
|
||||
Algorithms algorithms PS OPTIONAL
|
||||
Example example EX OPTIONAL
|
||||
)
|
||||
75
lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
Normal file
75
lib/kokkos/cmake/tpls/FindTPLCUSPARSE.cmake
Normal file
@ -0,0 +1,75 @@
|
||||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
# Check for CUDA support
|
||||
|
||||
# CUSPARSE needs the CUDA TPL enabled at version 4.1 or newer; abort the configure otherwise.
IF (NOT TPL_ENABLE_CUDA OR CUDA_VERSION VERSION_LESS "4.1")
|
||||
MESSAGE(FATAL_ERROR "\nCUSPARSE: did not find acceptable version of CUDA libraries (4.1 or greater)")
|
||||
ELSE()
|
||||
IF(CMAKE_VERSION VERSION_LESS "2.8.8")
|
||||
# FindCUDA before CMake 2.8.8 does not find the cusparse library; therefore, we must locate it ourselves with find_library.
|
||||
find_library(CUDA_cusparse_LIBRARY
|
||||
cusparse
|
||||
HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib
|
||||
)
|
||||
IF(CUDA_cusparse_LIBRARY STREQUAL "CUDA_cusparse_LIBRARY-NOTFOUND")
|
||||
MESSAGE(FATAL_ERROR "\nCUSPARSE: could not find cusparse library.")
|
||||
ENDIF()
|
||||
ENDIF(CMAKE_VERSION VERSION_LESS "2.8.8")
|
||||
# Set the TPL_CUSPARSE_* variables: no library dirs, the CUDA include dirs, and the cusparse library.
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
|
||||
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
|
||||
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
|
||||
ENDIF()
|
||||
|
||||
71
lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake
Normal file
71
lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake
Normal file
@ -0,0 +1,71 @@
|
||||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Hardware locality detection and control library.
|
||||
#
|
||||
# Acquisition information:
|
||||
# Date checked: November 2011
|
||||
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
|
||||
# Source: http://www.open-mpi.org/projects/hwloc/
|
||||
# Version: 1.3
|
||||
#
|
||||
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
|
||||
REQUIRED_HEADERS hwloc.h
|
||||
REQUIRED_LIBS_NAMES "hwloc"
|
||||
)
|
||||
|
||||
82
lib/kokkos/cmake/tpls/FindTPLPthread.cmake
Normal file
82
lib/kokkos/cmake/tpls/FindTPLPthread.cmake
Normal file
@ -0,0 +1,82 @@
|
||||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
|
||||
SET(USE_THREADS FALSE)
|
||||
|
||||
IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
|
||||
# Use CMake's Thread finder since it is a bit smarter in determining
|
||||
# whether pthreads is already built into the compiler and doesn't need
|
||||
# a library to link.
|
||||
FIND_PACKAGE(Threads)
|
||||
#If Threads found a copy of pthreads make sure it is one of the cases the tribits
|
||||
#tpl system cannot handle.
|
||||
IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
|
||||
IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
|
||||
SET(USE_THREADS TRUE)
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
ENDIF()
|
||||
|
||||
IF(USE_THREADS)
|
||||
SET(TPL_Pthread_INCLUDE_DIRS "")
|
||||
SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
|
||||
SET(TPL_Pthread_LIBRARY_DIRS "")
|
||||
ELSE()
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
|
||||
REQUIRED_HEADERS pthread.h
|
||||
REQUIRED_LIBS_NAMES pthread
|
||||
)
|
||||
ENDIF()
|
||||
70
lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
Normal file
70
lib/kokkos/cmake/tpls/FindTPLQTHREAD.cmake
Normal file
@ -0,0 +1,70 @@
|
||||
# @HEADER
|
||||
# ************************************************************************
|
||||
#
|
||||
# Trilinos: An Object-Oriented Solver Framework
|
||||
# Copyright (2001) Sandia Corporation
|
||||
#
|
||||
#
|
||||
# Copyright (2001) Sandia Corporation. Under the terms of Contract
|
||||
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
|
||||
# work by or on behalf of the U.S. Government. Export of this program
|
||||
# may require a license from the United States Government.
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright
|
||||
# notice, this list of conditions and the following disclaimer.
|
||||
#
|
||||
# 2. Redistributions in binary form must reproduce the above copyright
|
||||
# notice, this list of conditions and the following disclaimer in the
|
||||
# documentation and/or other materials provided with the distribution.
|
||||
#
|
||||
# 3. Neither the name of the Corporation nor the names of the
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
# NOTICE: The United States Government is granted for itself and others
|
||||
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
|
||||
# license in this data to reproduce, prepare derivative works, and
|
||||
# perform publicly and display publicly. Beginning five (5) years from
|
||||
# July 25, 2001, the United States Government is granted for itself and
|
||||
# others acting on its behalf a paid-up, nonexclusive, irrevocable
|
||||
# worldwide license in this data to reproduce, prepare derivative works,
|
||||
# distribute copies to the public, perform publicly and display
|
||||
# publicly, and to permit others to do so.
|
||||
#
|
||||
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
|
||||
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
|
||||
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
|
||||
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
|
||||
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
|
||||
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
|
||||
#
|
||||
# ************************************************************************
|
||||
# @HEADER
|
||||
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Qthreads lightweight threading library.
|
||||
#
|
||||
# Acquisition information:
|
||||
# Date checked: July 2014
|
||||
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
|
||||
# Source: https://code.google.com/p/qthreads
|
||||
#
|
||||
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
|
||||
REQUIRED_HEADERS qthread.h
|
||||
REQUIRED_LIBS_NAMES "qthread"
|
||||
)
|
||||
|
||||
@ -23,36 +23,72 @@ default_arch="sm_35"
|
||||
#
|
||||
# The default C++ compiler.
|
||||
#
|
||||
default_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
|
||||
#default_compiler="icpc"
|
||||
#default_compiler="/usr/local/gcc/4.8.3/bin/g++"
|
||||
#default_compiler="/usr/local/gcc/4.9.1/bin/g++"
|
||||
host_compiler=${NVCC_WRAPPER_DEFAULT_COMPILER:-"g++"}
|
||||
#host_compiler="icpc"
|
||||
#host_compiler="/usr/local/gcc/4.8.3/bin/g++"
|
||||
#host_compiler="/usr/local/gcc/4.9.1/bin/g++"
|
||||
|
||||
#
|
||||
# Internal variables
|
||||
#
|
||||
|
||||
# C++ files
|
||||
cpp_files=""
|
||||
|
||||
# Host compiler arguments
|
||||
xcompiler_args=""
|
||||
cuda_arg=""
|
||||
|
||||
# Cuda (NVCC) only arguments
|
||||
cuda_args=""
|
||||
|
||||
# Arguments for both NVCC and Host compiler
|
||||
shared_args=""
|
||||
|
||||
# Linker arguments
|
||||
xlinker_args=""
|
||||
|
||||
# Object files passable to NVCC
|
||||
object_files=""
|
||||
|
||||
# Link objects for the host linker only
|
||||
object_files_xlinker=""
|
||||
first_host_option=1
|
||||
|
||||
# Does the User set the architecture
|
||||
arch_set=0
|
||||
|
||||
# Does the user override the host compiler
|
||||
ccbin_set=0
|
||||
nvcc_error_code=0
|
||||
|
||||
#Error code of compilation
|
||||
error_code=0
|
||||
|
||||
# Do a dry run without actually compiling
|
||||
dry_run=0
|
||||
|
||||
# Skip NVCC compilation and use host compiler directly
|
||||
host_only=0
|
||||
|
||||
# Enable workaround for CUDA 6.5 for pragma ident
|
||||
replace_pragma_ident=0
|
||||
|
||||
# Mark first host compiler argument
|
||||
first_xcompiler_arg=1
|
||||
|
||||
temp_dir=${TMPDIR:-/tmp}
|
||||
|
||||
#echo "Arguments: $# $@"
|
||||
|
||||
while [ $# -gt 0 ]
|
||||
do
|
||||
case $1 in
|
||||
#show the executed command
|
||||
--show)
|
||||
--show|--nvcc-wrapper-show)
|
||||
dry_run=1
|
||||
;;
|
||||
#run host compilation only
|
||||
--host-only)
|
||||
host_only=1
|
||||
;;
|
||||
#replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and the non-standardized behaviour of pragma with macros
|
||||
--replace-pragma-ident)
|
||||
replace_pragma_ident=1
|
||||
@ -61,22 +97,31 @@ do
|
||||
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
|
||||
cpp_files="$cpp_files $1"
|
||||
;;
|
||||
#Handle shared args (valid for both nvcc and the host compiler)
|
||||
-O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
|
||||
shared_args="$shared_args $1"
|
||||
;;
|
||||
#Handle shared args that have an argument
|
||||
-o)
|
||||
shared_args="$shared_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
#Handle known nvcc args
|
||||
-O*|-D*|-gencode*|-c|-I*|-L*|-l*|-g|--help|--version|--dryrun|--verbose|--keep-dir|-E|-M|-G|--relocatable-device-code*|-shared|-lineinfo|-expt-extended-lambda|--resource-usage)
|
||||
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage)
|
||||
cuda_args="$cuda_args $1"
|
||||
;;
|
||||
#Handle known nvcc args that have an argument
|
||||
-rdc|-maxrregcount|--default-stream)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
#Handle c++11 setting
|
||||
--std=c++11|-std=c++11)
|
||||
cuda_args="$cuda_args $1"
|
||||
shared_args="$shared_args $1"
|
||||
;;
|
||||
#strip off -std=c++98: nvcc warns about it, and Tribits will place both -std=c++11 and -std=c++98
|
||||
-std=c++98|--std=c++98)
|
||||
;;
|
||||
#Handle known nvcc args that have an argument
|
||||
-o|-rdc|-maxrregcount|--default-stream)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
shift
|
||||
;;
|
||||
#strip off pedantic flags because they produce endless warnings about #LINE added by the preprocessor
|
||||
-pedantic|-Wpedantic|-ansi)
|
||||
;;
|
||||
@ -86,7 +131,12 @@ do
|
||||
#strip off "-x cu" because we add that ourselves
|
||||
-x)
|
||||
if [[ $2 != "cu" ]]; then
|
||||
xcompiler_args="$xcompiler_args,-x,$2"
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args="-x,$2"
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,-x,$2"
|
||||
fi
|
||||
fi
|
||||
shift
|
||||
;;
|
||||
@ -94,6 +144,7 @@ do
|
||||
-ccbin)
|
||||
cuda_args="$cuda_args $1 $2"
|
||||
ccbin_set=1
|
||||
host_compiler=$2
|
||||
shift
|
||||
;;
|
||||
#Handle -arch argument (if it's not set, use a default)
|
||||
@ -109,24 +160,25 @@ do
|
||||
#Handle args that should be sent to the linker
|
||||
-Wl*)
|
||||
xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
|
||||
host_linker_args="$host_linker_args ${1:4:${#1}}"
|
||||
;;
|
||||
#Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
|
||||
*.a|*.so|*.o|*.obj)
|
||||
object_files="$object_files $1"
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
#Handle object files: -x cu applies to all input files, so give them to linker, except if only linking
|
||||
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
|
||||
*.so.*|*.dylib)
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
object_files="$object_files -Xlinker $1"
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
#All other args are sent to the host compiler
|
||||
*)
|
||||
if [ $first_host_option -eq 0 ]; then
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
xcompiler_args=$1
|
||||
first_xcompiler_arg=0
|
||||
else
|
||||
xcompiler_args="$xcompiler_args,$1"
|
||||
else
|
||||
xcompiler_args="-Xcompiler $1"
|
||||
first_host_option=0
|
||||
fi
|
||||
;;
|
||||
esac
|
||||
@ -136,7 +188,7 @@ done
|
||||
|
||||
#Add default host compiler if necessary
|
||||
if [ $ccbin_set -ne 1 ]; then
|
||||
cuda_args="$cuda_args -ccbin $default_compiler"
|
||||
cuda_args="$cuda_args -ccbin $host_compiler"
|
||||
fi
|
||||
|
||||
#Add architecture command
|
||||
@ -145,7 +197,13 @@ if [ $arch_set -ne 1 ]; then
|
||||
fi
|
||||
|
||||
#Compose compilation command
|
||||
command="nvcc $cuda_args $xlinker_args $xcompiler_args"
|
||||
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args"
|
||||
if [ $first_xcompiler_arg -eq 0 ]; then
|
||||
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
|
||||
fi
|
||||
|
||||
#Compose host only command
|
||||
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args"
|
||||
|
||||
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
|
||||
if [ $replace_pragma_ident -eq 1 ]; then
|
||||
@ -155,31 +213,45 @@ if [ $replace_pragma_ident -eq 1 ]; then
|
||||
var=`grep pragma ${file} | grep ident | grep "#"`
|
||||
if [ "${#var}" -gt 0 ]
|
||||
then
|
||||
sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > /tmp/nvcc_wrapper_tmp_$file
|
||||
cpp_files2="$cpp_files2 /tmp/nvcc_wrapper_tmp_$file"
|
||||
sed 's/#[\ \t]*pragma[\ \t]*ident/#ident/g' $file > $temp_dir/nvcc_wrapper_tmp_$file
|
||||
cpp_files2="$cpp_files2 $temp_dir/nvcc_wrapper_tmp_$file"
|
||||
else
|
||||
cpp_files2="$cpp_files2 $file"
|
||||
fi
|
||||
done
|
||||
cpp_files=$cpp_files2
|
||||
echo $cpp_files
|
||||
#echo $cpp_files
|
||||
fi
|
||||
|
||||
if [ "$cpp_files" ]; then
|
||||
command="$command $object_files_xlinker -x cu $cpp_files"
|
||||
nvcc_command="$nvcc_command $object_files_xlinker -x cu $cpp_files"
|
||||
else
|
||||
command="$command $object_files"
|
||||
nvcc_command="$nvcc_command $object_files"
|
||||
fi
|
||||
|
||||
if [ "$cpp_files" ]; then
|
||||
host_command="$host_command $object_files $cpp_files"
|
||||
else
|
||||
host_command="$host_command $object_files"
|
||||
fi
|
||||
|
||||
#Print command for dryrun
|
||||
if [ $dry_run -eq 1 ]; then
|
||||
echo $command
|
||||
if [ $host_only -eq 1 ]; then
|
||||
echo $host_command
|
||||
else
|
||||
echo $nvcc_command
|
||||
fi
|
||||
exit 0
|
||||
fi
|
||||
|
||||
#Run compilation command
|
||||
$command
|
||||
nvcc_error_code=$?
|
||||
if [ $host_only -eq 1 ]; then
|
||||
$host_command
|
||||
else
|
||||
$nvcc_command
|
||||
fi
|
||||
error_code=$?
|
||||
|
||||
#Report error code
|
||||
exit $nvcc_error_code
|
||||
exit $error_code
|
||||
|
||||
@ -6,8 +6,6 @@
|
||||
|
||||
set -o pipefail
|
||||
|
||||
COMPILER_ROOT="/home/projects/x86-64"
|
||||
|
||||
GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
|
||||
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
|
||||
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
|
||||
@ -18,24 +16,17 @@ CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limi
|
||||
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
|
||||
CUDA_WARNING_FLAGS=""
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 gcc/4.7.2/base,hwloc/1.10.0/host/gnu/4.7.2 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 gcc/4.9.2/base,hwloc/1.10.0/host/gnu/4.9.2 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 gcc/4.9.2/base,hwloc/1.10.0/host/gnu/4.9.2 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 gcc/5.1.0/base,hwloc/1.10.0/host/gnu/5.1.0 $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 intel/14.0.4/base,hwloc/1.10.0/host/gnu/4.7.2 $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 intel/15.0.2/base,hwloc/1.10.0/host/gnu/4.7.2 $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.5.2 clang/3.5.2/base $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.6.1 clang/3.6.1/base $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/6.5.14 cuda/6.5.14,nvcc-wrapper/gnu,gcc/4.7.2/base $CUDA_BUILD_LIST nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.0.28 cuda/7.0.18,nvcc-wrapper/gnu,gcc/4.7.2/base $CUDA_BUILD_LIST nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
|
||||
export SEMS_MODULE_ROOT=/projects/modulefiles
|
||||
module use /home/projects/modulefiles
|
||||
module use /projects/modulefiles/rhel6-x86_64/sems/compiler
|
||||
declare -i NUM_RESULTS_TO_KEEP=7
|
||||
|
||||
RESULT_ROOT_PREFIX=TestAll
|
||||
|
||||
source /projects/modulefiles/utils/sems-modules-init.sh
|
||||
source /projects/modulefiles/utils/kokkos-modules-init.sh
|
||||
|
||||
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
|
||||
|
||||
@ -47,6 +38,9 @@ DEBUG=False
|
||||
ARGS=""
|
||||
CUSTOM_BUILD_LIST=""
|
||||
DRYRUN=False
|
||||
BUILD_ONLY=False
|
||||
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
|
||||
TEST_SCRIPT=False
|
||||
|
||||
while [[ $# > 0 ]]
|
||||
do
|
||||
@ -61,6 +55,15 @@ CUSTOM_BUILD_LIST="${key#*=}"
|
||||
--debug*)
|
||||
DEBUG=True
|
||||
;;
|
||||
--build-only*)
|
||||
BUILD_ONLY=True
|
||||
;;
|
||||
--test-script*)
|
||||
TEST_SCRIPT=True
|
||||
;;
|
||||
--num*)
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
|
||||
;;
|
||||
--dry-run*)
|
||||
DRYRUN=True
|
||||
;;
|
||||
@ -69,7 +72,10 @@ echo "test_all_sandia <ARGS> <OPTIONS>:"
|
||||
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
|
||||
echo " Defaults to root repo containing this script"
|
||||
echo "--debug: Run tests in debug. Defaults to False"
|
||||
echo "--test-script: Test this script, not Kokkos"
|
||||
echo "--num=N: Number of jobs to run in parallel "
|
||||
echo "--dry-run: Just print what would be executed"
|
||||
echo "--build-only: Just do builds, don't run anything"
|
||||
echo "--build-list=BUILD,BUILD,BUILD..."
|
||||
echo " Provide a comma-separated list of builds instead of running all builds"
|
||||
echo " Valid items:"
|
||||
@ -77,6 +83,18 @@ echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
|
||||
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
|
||||
echo ""
|
||||
echo "ARGS: list of expressions matching compilers to test"
|
||||
echo " supported compilers"
|
||||
echo " gcc/4.7.2"
|
||||
echo " gcc/4.8.4"
|
||||
echo " gcc/4.9.2"
|
||||
echo " gcc/5.1.0"
|
||||
echo " intel/14.0.4"
|
||||
echo " intel/15.0.2"
|
||||
echo " clang/3.5.2"
|
||||
echo " clang/3.6.1"
|
||||
echo " cuda/6.5.14"
|
||||
echo " cuda/7.0.28"
|
||||
echo " cuda/7.5.18"
|
||||
echo ""
|
||||
echo "Examples:"
|
||||
echo " Run all tests"
|
||||
@ -93,6 +111,10 @@ echo " % test_all_sandia --debug"
|
||||
echo ""
|
||||
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
|
||||
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
|
||||
echo ""
|
||||
echo "If you want to kill the tests, do:"
|
||||
echo " hit ctrl-z"
|
||||
echo " % kill -9 %1"
|
||||
echo
|
||||
exit 0
|
||||
;;
|
||||
@ -104,7 +126,6 @@ esac
|
||||
shift
|
||||
done
|
||||
|
||||
|
||||
# set kokkos path
|
||||
if [ -z "$KOKKOS_PATH" ]; then
|
||||
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
|
||||
@ -125,12 +146,26 @@ if [ -z "$ARGS" ]; then
|
||||
ARGS='?'
|
||||
fi
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
# Process args to figure out which compilers to test
|
||||
COMPILERS_TO_TEST=""
|
||||
for ARG in $ARGS; do
|
||||
for COMPILER_DATA in "${COMPILERS[@]}"; do
|
||||
arr=($COMPILER_DATA)
|
||||
COMPILER=${arr[0]}
|
||||
ARR=($COMPILER_DATA)
|
||||
COMPILER=${ARR[0]}
|
||||
if [[ "$COMPILER" = $ARG* ]]; then
|
||||
if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
|
||||
COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
|
||||
@ -145,15 +180,28 @@ done
|
||||
# Functions
|
||||
#
|
||||
|
||||
# get_compiler_name <COMPILER>
|
||||
get_compiler_name() {
|
||||
echo $1 | cut -d/ -f1
|
||||
}
|
||||
|
||||
# get_compiler_version <COMPILER>
|
||||
get_compiler_version() {
|
||||
echo $1 | cut -d/ -f2
|
||||
}
|
||||
|
||||
# Do not call directly
|
||||
get_compiler_data() {
|
||||
compiler=$1
|
||||
item=$2
|
||||
local compiler=$1
|
||||
local item=$2
|
||||
local compiler_name=$(get_compiler_name $compiler)
|
||||
local compiler_vers=$(get_compiler_version $compiler)
|
||||
|
||||
local compiler_data
|
||||
for compiler_data in "${COMPILERS[@]}" ; do
|
||||
arr=($compiler_data)
|
||||
local arr=($compiler_data)
|
||||
if [ "$compiler" = "${arr[0]}" ]; then
|
||||
echo "${arr[$item]}" | tr , ' '
|
||||
echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
@ -186,33 +234,60 @@ get_compiler_warning_flags() {
|
||||
run_cmd() {
|
||||
echo "RUNNING: $*"
|
||||
if [ "$DRYRUN" != "True" ]; then
|
||||
eval "$*"
|
||||
eval "$* 2>&1"
|
||||
fi
|
||||
}
|
||||
|
||||
# report_and_log_test_result <SUCCESS> <DESC> <PHASE>
|
||||
report_and_log_test_result() {
|
||||
if [ "$1" = "0" ]; then
|
||||
echo "PASSED $2"
|
||||
TEST_RESULTS="${TEST_RESULTS}\nPASSED $2"
|
||||
# Use sane var names
|
||||
local success=$1; local desc=$2; local phase=$3;
|
||||
|
||||
if [ "$success" = "0" ]; then
|
||||
echo " PASSED $desc"
|
||||
touch $PASSED_DIR/$desc
|
||||
else
|
||||
echo "FAILED $2" >&2
|
||||
TEST_RESULTS="${TEST_RESULTS}\nFAILED $2 ($3)"
|
||||
NUM_FAILED+=1
|
||||
echo " FAILED $desc" >&2
|
||||
echo $phase > $FAILED_DIR/$desc
|
||||
cat ${desc}.${phase}.log
|
||||
fi
|
||||
}
|
||||
|
||||
setup_env() {
|
||||
local compiler=$1
|
||||
local compiler_modules=$(get_compiler_modules $compiler)
|
||||
|
||||
module purge
|
||||
|
||||
local mod
|
||||
for mod in $compiler_modules; do
|
||||
module load $mod 2>&1
|
||||
# It is ridiculously hard to check for the success of a loaded
|
||||
# module. Module does not return error codes and piping to grep
|
||||
# causes module to run in a subshell.
|
||||
module list 2>&1 | grep "$mod" >& /dev/null || return 1
|
||||
done
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
|
||||
single_build_and_test() {
|
||||
# Use sane var names
|
||||
local compiler=$1; local build=$2; local build_type=$3;
|
||||
|
||||
cd $ROOT_DIR/$compiler
|
||||
# set up env
|
||||
mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
|
||||
cd $ROOT_DIR/$compiler/"${build}-$build_type"
|
||||
local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
|
||||
setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
|
||||
# Set up flags
|
||||
local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
|
||||
local compiler_exe=$(get_compiler_exe_name $compiler)
|
||||
|
||||
if [[ "$build_type" = hwloc* ]]; then
|
||||
local extra_args="--with-hwloc=$HWLOC_ROOT"
|
||||
local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
|
||||
fi
|
||||
|
||||
if [[ "$build_type" = *debug* ]]; then
|
||||
@ -222,36 +297,63 @@ single_build_and_test() {
|
||||
local cxxflags="-O3 $compiler_warning_flags"
|
||||
fi
|
||||
|
||||
local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
|
||||
echo " Doing build: $desc"
|
||||
|
||||
mkdir "${build}-$build_type"
|
||||
cd "${build}-$build_type"
|
||||
if [[ "$compiler" == cuda* ]]; then
|
||||
cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
|
||||
export TMPDIR=$(pwd)
|
||||
fi
|
||||
|
||||
# cxxflags="-DKOKKOS_USING_EXPERIMENTAL_VIEW $cxxflags"
|
||||
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" \"$extra_args\" 2>&1 | tee ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
run_cmd make build-test 2>&1 | tee ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
run_cmd make test 2>&1 | tee ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
|
||||
echo " Starting job $desc"
|
||||
|
||||
if [ "$TEST_SCRIPT" = "True" ]; then
|
||||
local rand=$[ 1 + $[ RANDOM % 10 ]]
|
||||
sleep $rand
|
||||
if [ $rand -gt 5 ]; then
|
||||
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
|
||||
fi
|
||||
else
|
||||
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
if [[ "$BUILD_ONLY" == False ]]; then
|
||||
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
|
||||
fi
|
||||
fi
|
||||
|
||||
report_and_log_test_result 0 $desc
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
setup_env() {
|
||||
local compiler=$1
|
||||
local compiler_modules=$(get_compiler_modules $compiler)
|
||||
|
||||
module purge
|
||||
|
||||
for mod in $compiler_modules; do
|
||||
module load $mod
|
||||
# It is ridiculously hard to check for the success of a loaded
|
||||
# module. Module does not return error codes and piping to grep
|
||||
# causes module to run in a subshell.
|
||||
module list 2>&1 | grep "$mod"
|
||||
# wait_for_jobs <NUM-JOBS>
|
||||
wait_for_jobs() {
|
||||
local -i max_jobs=$1
|
||||
local -i num_active_jobs=$(jobs | wc -l)
|
||||
while [ $num_active_jobs -ge $max_jobs ]
|
||||
do
|
||||
sleep 1
|
||||
num_active_jobs=$(jobs | wc -l)
|
||||
jobs >& /dev/null
|
||||
done
|
||||
}
|
||||
|
||||
# run_in_background <COMPILER> <BUILD> <BUILD_TYPE>
|
||||
run_in_background() {
|
||||
local compiler=$1
|
||||
|
||||
local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
|
||||
if [[ "$BUILD_ONLY" == True ]]; then
|
||||
num_jobs=8
|
||||
else
|
||||
if [[ "$compiler" == cuda* ]]; then
|
||||
num_jobs=1
|
||||
fi
|
||||
fi
|
||||
wait_for_jobs $num_jobs
|
||||
|
||||
single_build_and_test $* &
|
||||
}
|
||||
|
||||
# build_and_test_all <COMPILER>
|
||||
build_and_test_all() {
|
||||
# Get compiler data
|
||||
@ -262,44 +364,74 @@ build_and_test_all() {
|
||||
local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
|
||||
fi
|
||||
|
||||
# set up env
|
||||
cd $ROOT_DIR
|
||||
mkdir -p $compiler
|
||||
setup_env $compiler
|
||||
|
||||
# do builds
|
||||
local build
|
||||
for build in $compiler_build_list
|
||||
do
|
||||
single_build_and_test $compiler $build $BUILD_TYPE
|
||||
run_in_background $compiler $build $BUILD_TYPE
|
||||
|
||||
# If not cuda, do a hwloc test too
|
||||
if [[ "$compiler" != cuda* ]]; then
|
||||
single_build_and_test $compiler $build "hwloc-$BUILD_TYPE"
|
||||
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
|
||||
fi
|
||||
done
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
get_test_root_dir() {
|
||||
local existing_results=$(find . -maxdepth 1 -name "$RESULT_ROOT_PREFIX*" | sort)
|
||||
local -i num_existing_results=$(echo $existing_results | tr ' ' '\n' | wc -l)
|
||||
local -i num_to_delete=${num_existing_results}-${NUM_RESULTS_TO_KEEP}
|
||||
|
||||
if [ $num_to_delete -gt 0 ]; then
|
||||
/bin/rm -rf $(echo $existing_results | tr ' ' '\n' | head -n $num_to_delete)
|
||||
fi
|
||||
|
||||
echo $(pwd)/${RESULT_ROOT_PREFIX}_$(date +"%Y-%m-%d_%H.%M.%S")
|
||||
}
|
||||
|
||||
wait_summarize_and_exit() {
|
||||
wait_for_jobs 1
|
||||
|
||||
echo "#######################################################"
|
||||
echo "PASSED TESTS"
|
||||
echo "#######################################################"
|
||||
|
||||
\ls -1 $PASSED_DIR | sort
|
||||
|
||||
echo "#######################################################"
|
||||
echo "FAILED TESTS"
|
||||
echo "#######################################################"
|
||||
|
||||
local failed_test
|
||||
local -i rv=0
|
||||
for failed_test in $(\ls -1 $FAILED_DIR)
|
||||
do
|
||||
echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
|
||||
rv=$rv+1
|
||||
done
|
||||
|
||||
exit $rv
|
||||
}
|
||||
|
||||
#
|
||||
# Main
|
||||
#
|
||||
|
||||
/bin/rm -rf TestAll
|
||||
mkdir TestAll
|
||||
cd TestAll
|
||||
ROOT_DIR=$(get_test_root_dir)
|
||||
mkdir -p $ROOT_DIR
|
||||
cd $ROOT_DIR
|
||||
|
||||
TEST_RESULTS=""
|
||||
declare -i NUM_FAILED=0
|
||||
ROOT_DIR=$(pwd)
|
||||
PASSED_DIR=$ROOT_DIR/results/passed
|
||||
FAILED_DIR=$ROOT_DIR/results/failed
|
||||
mkdir -p $PASSED_DIR
|
||||
mkdir -p $FAILED_DIR
|
||||
|
||||
echo "Going to test compilers: " $COMPILERS_TO_TEST
|
||||
for COMPILER in $COMPILERS_TO_TEST; do
|
||||
echo "Testing compiler $COMPILER"
|
||||
build_and_test_all $COMPILER
|
||||
done
|
||||
|
||||
echo "#######################################################"
|
||||
echo "RESULT SUMMARY"
|
||||
echo "#######################################################"
|
||||
echo -e $TEST_RESULTS
|
||||
|
||||
exit $NUM_FAILED
|
||||
wait_summarize_and_exit
|
||||
|
||||
287
lib/kokkos/config/testing_scripts/obj_size_opt_check
Executable file
287
lib/kokkos/config/testing_scripts/obj_size_opt_check
Executable file
@ -0,0 +1,287 @@
|
||||
#! /usr/bin/env python
|
||||
|
||||
"""
|
||||
Compute the size at which the current compiler will start to
|
||||
significantly scale back optimization.
|
||||
|
||||
The CPP file being modified will need the following tags.
|
||||
// JGF_DUPLICATE_BEGIN - Put before start of function to duplicate
|
||||
// JGF_DUPLICATE_END - Put after end of function to duplicate
|
||||
// JGF_DUPE: function_name(args); - Put anywhere where it's legal to
|
||||
put a function call but not in your timing section.
|
||||
|
||||
The program will need to output the string:
|
||||
FOM: <number>
|
||||
This will represent the program's performance
|
||||
"""
|
||||
|
||||
import argparse, sys, os, doctest, subprocess, re, time
|
||||
|
||||
VERBOSE = False
|
||||
|
||||
###############################################################################
|
||||
def parse_command_line(args, description):
###############################################################################
    """
    Parse the command line of this tool.

    args is the complete argv list; args[0] is only used to show the program
    name in the usage text. description is the text shown by --help.

    Returns the tuple
    (cppfile, buildcmd, execmd, start, end, repeat, template, csv).

    Side effect: sets the module-level VERBOSE flag to True when -v/--verbose
    is given (the flag is never reset to False here).
    """
    global VERBOSE

    prog = os.path.basename(args[0])

    parser = argparse.ArgumentParser(
        usage="""\n%s <cppfile> <build-command> <run-command> [--verbose]
OR
%s --help
OR
%s --test

\033[1mEXAMPLES:\033[0m
    > %s foo.cpp 'make -j4' foo
""" % ((prog, ) * 4),

        description=description,

        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    parser.add_argument("cppfile", help="Name of file to modify.")

    parser.add_argument("buildcmd", help="Build command")

    parser.add_argument("execmd", help="Run command")

    parser.add_argument("-v", "--verbose", action="store_true",
                        help="Print extra information")

    parser.add_argument("-s", "--start", type=int, default=1,
                        help="Starting number of dupes")

    parser.add_argument("-e", "--end", type=int, default=1000,
                        help="Ending number of dupes")

    parser.add_argument("-n", "--repeat", type=int, default=10,
                        help="Number of times to repeat an individual execution. Best value will be taken.")

    parser.add_argument("-t", "--template", action="store_true",
                        help="Use templating instead of source copying to increase object size")

    parser.add_argument("-c", "--csv", action="store_true",
                        help="Print results as CSV")

    # args[0] is the program name, not an option
    args = parser.parse_args(args[1:])

    if (args.verbose):
        VERBOSE = True

    return args.cppfile, args.buildcmd, args.execmd, args.start, args.end, args.repeat, args.template, args.csv
|
||||
|
||||
###############################################################################
|
||||
def verbose_print(msg, override=None):
###############################################################################
    """
    Print msg to stdout when verbose output is wanted.

    override=None defers to the module-level VERBOSE flag, a truthy override
    forces the message out, and override=False suppresses it even when
    VERBOSE is set.
    """
    if ((VERBOSE and override is not False) or override):
        # The parenthesized single-argument form prints the same text under
        # Python 2 and Python 3.
        print(msg)
|
||||
|
||||
###############################################################################
|
||||
def error_print(msg):
###############################################################################
    """
    Write msg followed by a newline to stderr.
    """
    # Explicit write instead of the Python-2-only "print >>" statement so the
    # function works under both Python 2 and Python 3.
    sys.stderr.write("%s\n" % (msg,))
|
||||
|
||||
###############################################################################
|
||||
def expect(condition, error_msg):
###############################################################################
    """
    Abort with "FAIL: <error_msg>" when condition is falsy.

    Meant for reporting user errors rather than programming errors: it raises
    SystemExit instead of tripping an assert, so no stacktrace is shown.
    """
    if (condition):
        return

    raise SystemExit("FAIL: %s" % error_msg)
|
||||
|
||||
###############################################################################
|
||||
def run_cmd(cmd, ok_to_fail=False, input_str=None, from_dir=None, verbose=None,
            arg_stdout=subprocess.PIPE, arg_stderr=subprocess.PIPE):
###############################################################################
    """
    Run cmd through the shell and wait for it to finish.

    input_str, when not None, is fed to the command's stdin. from_dir is the
    working directory for the command. verbose is forwarded to verbose_print
    for the "RUN:" trace line. arg_stdout/arg_stderr are passed to
    subprocess.Popen as the stdout/stderr targets.

    Returns:
      - (status, output, errput) when ok_to_fail is true;
      - otherwise the captured stdout with surrounding whitespace stripped
        (None when stdout was not captured).

    Raises SystemExit (through expect) when ok_to_fail is false and the
    command exits with a non-zero status.
    """
    verbose_print("RUN: %s" % cmd, verbose)

    stdin = subprocess.PIPE if (input_str is not None) else None

    # universal_newlines=True makes the pipes text-mode so that callers get
    # str (not bytes) under Python 3 and can apply str regexes to the output.
    proc = subprocess.Popen(cmd,
                            shell=True,
                            stdout=arg_stdout,
                            stderr=arg_stderr,
                            stdin=stdin,
                            cwd=from_dir,
                            universal_newlines=True)
    output, errput = proc.communicate(input_str)
    if (output is not None):
        output = output.strip()
    stat = proc.wait()

    if (ok_to_fail):
        return stat, output, errput

    if (stat != 0):
        if (arg_stderr is None):
            expect(False, "Command: '%s' failed. See terminal output" % cmd)

        # stderr went to a file object rather than a pipe: recover the text
        # from that file, closing the handle once it has been read.
        if (errput is None and hasattr(arg_stderr, "name")):
            with open(arg_stderr.name, "r") as stderr_file:
                errput = stderr_file.read()

        expect(False, "Command: '%s' failed with error '%s'" % (cmd, errput))

    return output
|
||||
|
||||
###############################################################################
|
||||
def build_and_run(source, cppfile, buildcmd, execmd, repeat):
###############################################################################
    """
    Write source to cppfile, build it, run it repeat times and return the best
    figure of merit.

    source is an iterable of lines (written as-is with writelines). After
    buildcmd succeeds, execmd is run repeat times; each run must print a line
    of the exact form "FOM: <number>", and the first such line of a run is
    that run's value. The largest value over all runs is returned (None when
    repeat is 0).

    Exits through expect when a run prints no FOM line, and through run_cmd
    when the build or a run fails.
    """
    # Close (and thereby flush) the file before the build command reads it.
    with open(cppfile, 'w') as cpp_fd:
        cpp_fd.writelines(source)

    run_cmd(buildcmd)

    fom_regex = re.compile(r'^FOM: ([0-9.]+)$')

    best = None
    for _ in range(repeat):
        wait_for_quiet_machine()
        output = run_cmd(execmd)

        current = None
        for line in output.splitlines():
            m = fom_regex.match(line)
            if (m is not None):
                current = float(m.group(1))
                break

        expect(current is not None, "No lines in output matched FOM regex")

        # Higher FOM wins
        if (best is None or best < current):
            best = current

    return best
|
||||
|
||||
###############################################################################
|
||||
def wait_for_quiet_machine():
###############################################################################
    """
    Return once top reports the CPU as at least 95% idle.

    Every attempt sleeps two seconds and then samples top. While the idle
    percentage is below 95 a message is written through error_print and the
    attempt is repeated. Exits through expect when the sampled field does not
    look like "<number>%id".
    """
    idle_pct_re = re.compile(r'^([0-9.]+)%id$')

    # The first iteration of top gives garbage results
    top_cmd = "top -bn2 | grep 'Cpu(s)' | tr ',' ' ' | tail -n 1 | awk '{print $5}'"

    while (True):
        time.sleep(2)

        sample = run_cmd(top_cmd)
        found = idle_pct_re.match(sample)

        expect(found is not None, "top not returning output in expected form")

        if (float(found.group(1)) >= 95):
            return

        error_print("Machine is too busy, waiting for it to become free")
|
||||
|
||||
###############################################################################
|
||||
def add_n_dupes(curr_lines, num_dupes, template):
###############################################################################
    """
    Insert num_dupes duplicates of the tagged function into curr_lines, in place.

    curr_lines is the list of source lines. The lines between the
    JGF_DUPLICATE_BEGIN and JGF_DUPLICATE_END tags are the function to
    duplicate; its name is taken from the line right after the BEGIN tag. The
    text after "JGF_DUPE: " is the example invocation.

    For i in [0, num_dupes):
      - unless template is true, a copy of the function renamed to <name><i>
        (the rename is applied to its first line) is inserted after the END
        tag;
      - an "else if (jgf_rand == i)" branch calling <name><i> (or <name><i>
        as a template argument, "<name><<i>>", when template is true) is
        inserted after the JGF_DUPE line.

    Exits through expect when a tag is missing or malformed, or when the
    JGF_DUPE tag does not come after the JGF_DUPLICATE_END tag.
    """
    function_name = None
    function_invocation = None
    function_lines = []

    function_re = re.compile(r'^.* (\w+) *[(]')
    function_inv_re = re.compile(r'^.*JGF_DUPE: +(.+)$')

    # Get function lines
    record = False
    definition_insertion_point = None
    invocation_insertion_point = None
    for idx, line in enumerate(curr_lines):
        if ("JGF_DUPLICATE_BEGIN" in line):
            record = True
            expect(idx + 1 < len(curr_lines),
                   "JGF_DUPLICATE_BEGIN must be followed by the function to duplicate")
            m = function_re.match(curr_lines[idx+1])
            expect(m is not None, "Could not find function in line '%s'" % curr_lines[idx+1])
            function_name = m.group(1)

        elif ("JGF_DUPLICATE_END" in line):
            record = False
            definition_insertion_point = idx + 1

        elif (record):
            function_lines.append(line)

        elif ("JGF_DUPE" in line):
            m = function_inv_re.match(line)
            expect(m is not None, "Could not find function invocation example in line '%s'" % line)
            function_invocation = m.group(1)
            invocation_insertion_point = idx + 1

    expect(function_name is not None, "Could not find name of dupe function")
    expect(function_invocation is not None, "Could not find function invocation point")

    # Without an END tag the insertion point stays None; slicing with it would
    # replace the whole list (Python 2) or raise TypeError (Python 3).
    expect(definition_insertion_point is not None, "Could not find JGF_DUPLICATE_END tag")
    expect(definition_insertion_point < invocation_insertion_point,
           "JGF_DUPE tag must come after JGF_DUPLICATE_END")

    dupe_func_defs = []
    dupe_invocations = ["int jgf_rand = std::rand();\n", "if (false) {}\n"]

    for i in range(num_dupes):
        if (template):
            dupe_name = "%s<%d>" % (function_name, i)
        else:
            dupe_name = "%s%d" % (function_name, i)
            dupe_func = list(function_lines)
            dupe_func[0] = dupe_func[0].replace(function_name, dupe_name)
            dupe_func_defs.extend(dupe_func)

        dupe_invocations.append("else if (jgf_rand == %d) " % i)
        dupe_invocations.append(function_invocation.replace(function_name, dupe_name) + "\n")

    # Insert at the later position first so the earlier index stays valid.
    curr_lines[invocation_insertion_point:invocation_insertion_point] = dupe_invocations
    curr_lines[definition_insertion_point:definition_insertion_point] = dupe_func_defs
|
||||
|
||||
###############################################################################
|
||||
def report(num_dupes, curr_lines, object_file, orig_fom, curr_fom, csv=False, is_first_report=False):
###############################################################################
    """
    Print one result record to stdout.

    Reports num_dupes, the size in bytes of object_file, the number of entries
    in curr_lines, curr_fom and the relative change of curr_fom against
    orig_fom as a percentage.

    With csv true the record is one comma-separated line, preceded by a header
    line when is_first_report is true; otherwise it is a labelled multi-line
    block.

    Raises ZeroDivisionError when orig_fom is zero and OSError when
    object_file cannot be inspected.
    """
    fom_change = (curr_fom - orig_fom) / orig_fom
    obj_byte_size = os.path.getsize(object_file)
    loc = len(curr_lines)

    # Each print gets a single pre-formatted string so the output is the same
    # under Python 2 and Python 3.
    if (csv):
        if (is_first_report):
            print("num_dupes, obj_byte_size, loc, fom, pct_diff")

        print("%s, %s, %s, %s, %s" % (num_dupes, obj_byte_size, loc, curr_fom, fom_change*100))
    else:
        print("========================================================")
        print("For number of dupes: %s" % num_dupes)
        print("Object file size (bytes): %s" % obj_byte_size)
        print("Lines of code: %s" % loc)
        print("Field of merit: %s" % curr_fom)
        print("Change pct: %s" % (fom_change*100))
|
||||
|
||||
###############################################################################
|
||||
def obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv=False):
###############################################################################
    """
    Measure how the figure of merit changes as the tagged function in cppfile
    is duplicated more and more often.

    The unmodified source is built and run first as the baseline. Then, for
    i = start, 2*start, 4*start, ... while i < end, the source gets i
    duplicates (add_n_dupes), is rebuilt and rerun (build_and_run), and the
    result is printed (report). The object file is assumed to be cppfile with
    its extension replaced by ".o".

    cppfile is moved to "<cppfile>.orig" while the experiment runs and is
    moved back when it ends, whether it completed or failed.
    """
    # i doubles each round, so a non-positive start would never reach end.
    expect(start > 0 or end <= start,
           "Starting number of dupes must be positive, got %s" % start)

    with open(cppfile, 'r') as cpp_fd:
        orig_source_lines = cpp_fd.readlines()

    backup_file = "%s.orig" % cppfile
    object_file = "%s.o" % os.path.splitext(cppfile)[0]
    os.rename(cppfile, backup_file)

    try:
        orig_fom = build_and_run(orig_source_lines, cppfile, buildcmd, execmd, repeat)
        report(0, orig_source_lines, object_file, orig_fom, orig_fom, csv=csv, is_first_report=True)

        i = start
        while (i < end):
            curr_lines = list(orig_source_lines)
            add_n_dupes(curr_lines, i, template)

            curr_fom = build_and_run(curr_lines, cppfile, buildcmd, execmd, repeat)

            report(i, curr_lines, object_file, orig_fom, curr_fom, csv=csv)

            i *= 2 # make growth function configurable?
    finally:
        # Put the user's original source back even when a build or run
        # failed; otherwise the generated file would be left in its place.
        if (os.path.exists(cppfile)):
            os.remove(cppfile)
        os.rename(backup_file, cppfile)
|
||||
|
||||
###############################################################################
|
||||
def _main_func(description):
|
||||
###############################################################################
|
||||
if ("--test" in sys.argv):
|
||||
test_results = doctest.testmod(verbose=True)
|
||||
sys.exit(1 if test_results.failed > 0 else 0)
|
||||
|
||||
cppfile, buildcmd, execmd, start, end, repeat, template, csv = parse_command_line(sys.argv, description)
|
||||
|
||||
obj_size_opt_check(cppfile, buildcmd, execmd, start, end, repeat, template, csv)
|
||||
|
||||
###############################################################################
|
||||
if (__name__ == "__main__"):
|
||||
_main_func(__doc__)
|
||||
10
lib/kokkos/containers/CMakeLists.txt
Normal file
10
lib/kokkos/containers/CMakeLists.txt
Normal file
@ -0,0 +1,10 @@
|
||||
|
||||
|
||||
TRIBITS_SUBPACKAGE(Containers)
|
||||
|
||||
ADD_SUBDIRECTORY(src)
|
||||
|
||||
TRIBITS_ADD_TEST_DIRECTORIES(unit_tests)
|
||||
TRIBITS_ADD_TEST_DIRECTORIES(performance_tests)
|
||||
|
||||
TRIBITS_SUBPACKAGE_POSTPROCESS()
|
||||
5
lib/kokkos/containers/cmake/Dependencies.cmake
Normal file
5
lib/kokkos/containers/cmake/Dependencies.cmake
Normal file
@ -0,0 +1,5 @@
|
||||
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
LIB_REQUIRED_PACKAGES KokkosCore
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
|
||||
TEST_OPTIONAL_TPLS CUSPARSE
|
||||
)
|
||||
4
lib/kokkos/containers/cmake/KokkosContainers_config.h.in
Normal file
4
lib/kokkos/containers/cmake/KokkosContainers_config.h.in
Normal file
@ -0,0 +1,4 @@
|
||||
#ifndef KOKKOS_CONTAINERS_CONFIG_H
|
||||
#define KOKKOS_CONTAINERS_CONFIG_H
|
||||
|
||||
#endif
|
||||
26
lib/kokkos/containers/performance_tests/CMakeLists.txt
Normal file
26
lib/kokkos/containers/performance_tests/CMakeLists.txt
Normal file
@ -0,0 +1,26 @@
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
|
||||
|
||||
SET(SOURCES
|
||||
TestMain.cpp
|
||||
TestCuda.cpp
|
||||
)
|
||||
|
||||
IF(Kokkos_ENABLE_Pthread)
|
||||
LIST( APPEND SOURCES TestThreads.cpp)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_OpenMP)
|
||||
LIST( APPEND SOURCES TestOpenMP.cpp)
|
||||
ENDIF()
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerformanceTest
|
||||
SOURCES ${SOURCES}
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
@ -6,12 +6,12 @@ vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests
|
||||
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = nvcc_wrapper
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
@ -50,7 +50,7 @@ KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
|
||||
|
||||
KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads
|
||||
|
||||
|
||||
KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
|
||||
|
||||
@ -63,11 +63,11 @@ test-threads: KokkosContainers_PerformanceTest_Threads
|
||||
test-openmp: KokkosContainers_PerformanceTest_OpenMP
|
||||
./KokkosContainers_PerformanceTest_OpenMP
|
||||
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
||||
|
||||
clean: kokkos-clean
|
||||
rm -f *.o $(TARGETS)
|
||||
|
||||
|
||||
31
lib/kokkos/containers/src/CMakeLists.txt
Normal file
31
lib/kokkos/containers/src/CMakeLists.txt
Normal file
@ -0,0 +1,31 @@
|
||||
|
||||
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
SET(HEADERS "")
|
||||
SET(SOURCES "")
|
||||
|
||||
SET(HEADERS_IMPL "")
|
||||
|
||||
FILE(GLOB HEADERS *.hpp)
|
||||
FILE(GLOB HEADERS_IMPL impl/*.hpp)
|
||||
FILE(GLOB SOURCES impl/*.cpp)
|
||||
|
||||
SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
|
||||
|
||||
INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/)
|
||||
|
||||
TRIBITS_ADD_LIBRARY(
|
||||
kokkoscontainers
|
||||
HEADERS ${HEADERS}
|
||||
NOINSTALLHEADERS ${HEADERS_IMPL}
|
||||
SOURCES ${SOURCES}
|
||||
DEPLIBS
|
||||
)
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
@ -90,7 +90,7 @@ public:
|
||||
private:
|
||||
enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
|
||||
enum { block_mask = block_size-1u };
|
||||
enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
|
||||
enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
|
||||
|
||||
public:
|
||||
|
||||
@ -322,7 +322,7 @@ public:
|
||||
private:
|
||||
enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
|
||||
enum { block_mask = block_size -1u };
|
||||
enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
|
||||
enum { block_shift = Kokkos::Impl::integral_power_of_two(block_size) };
|
||||
|
||||
public:
|
||||
ConstBitset()
|
||||
|
||||
@ -106,9 +106,9 @@ public:
|
||||
|
||||
//! The type of a Kokkos::View on the device.
|
||||
typedef View< typename traits::data_type ,
|
||||
typename traits::array_layout ,
|
||||
typename traits::device_type ,
|
||||
typename traits::memory_traits > t_dev ;
|
||||
Arg1Type ,
|
||||
Arg2Type ,
|
||||
Arg3Type > t_dev ;
|
||||
|
||||
/// \typedef t_host
|
||||
/// \brief The type of a Kokkos::View host mirror of \c t_dev.
|
||||
@ -117,9 +117,9 @@ public:
|
||||
//! The type of a const View on the device.
|
||||
//! The type of a Kokkos::View on the device.
|
||||
typedef View< typename traits::const_data_type ,
|
||||
typename traits::array_layout ,
|
||||
typename traits::device_type ,
|
||||
typename traits::memory_traits > t_dev_const ;
|
||||
Arg1Type ,
|
||||
Arg2Type ,
|
||||
Arg3Type > t_dev_const ;
|
||||
|
||||
/// \typedef t_host_const
|
||||
/// \brief The type of a const View host mirror of \c t_dev_const.
|
||||
@ -221,6 +221,19 @@ public:
|
||||
modified_host (src.modified_host)
|
||||
{}
|
||||
|
||||
//! Subview constructor
//!
//! Builds the device and host views as Kokkos::subview of the corresponding
//! views of \c src, passing \c arg0 and \c args unchanged to both calls, and
//! copies the \c modified_device / \c modified_host members of \c src.
//!
//! \param src   DualView to take the subview of (may have different template
//!              arguments than this DualView).
//! \param arg0  First subview argument; at least one is required.
//! \param args  Remaining subview arguments, taken by value.
|
||||
template< class SD, class S1 , class S2 , class S3
|
||||
, class Arg0 , class ... Args >
|
||||
DualView( const DualView<SD,S1,S2,S3> & src
|
||||
, const Arg0 & arg0
|
||||
, Args ... args
|
||||
)
|
||||
: d_view( Kokkos::subview( src.d_view , arg0 , args ... ) )
|
||||
, h_view( Kokkos::subview( src.h_view , arg0 , args ... ) )
|
||||
, modified_device (src.modified_device)
|
||||
, modified_host (src.modified_host)
|
||||
{}
|
||||
|
||||
/// \brief Create DualView from existing device and host View objects.
|
||||
///
|
||||
/// This constructor assumes that the device and host View objects
|
||||
@ -237,7 +250,30 @@ public:
|
||||
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
|
||||
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
|
||||
{
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ());
|
||||
#else
|
||||
if ( d_view.rank != h_view.rank ||
|
||||
d_view.dimension_0() != h_view.dimension_0() ||
|
||||
d_view.dimension_1() != h_view.dimension_1() ||
|
||||
d_view.dimension_2() != h_view.dimension_2() ||
|
||||
d_view.dimension_3() != h_view.dimension_3() ||
|
||||
d_view.dimension_4() != h_view.dimension_4() ||
|
||||
d_view.dimension_5() != h_view.dimension_5() ||
|
||||
d_view.dimension_6() != h_view.dimension_6() ||
|
||||
d_view.dimension_7() != h_view.dimension_7() ||
|
||||
d_view.stride_0() != h_view.stride_0() ||
|
||||
d_view.stride_1() != h_view.stride_1() ||
|
||||
d_view.stride_2() != h_view.stride_2() ||
|
||||
d_view.stride_3() != h_view.stride_3() ||
|
||||
d_view.stride_4() != h_view.stride_4() ||
|
||||
d_view.stride_5() != h_view.stride_5() ||
|
||||
d_view.stride_6() != h_view.stride_6() ||
|
||||
d_view.stride_7() != h_view.stride_7() ||
|
||||
d_view.span() != h_view.span() ) {
|
||||
Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
//@}
|
||||
@ -501,6 +537,52 @@ public:
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// Partial specializations of Kokkos::subview() for DualView objects.
|
||||
//
|
||||
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/// \brief Names the DualView type returned by subview() for a
///        DualView<D,A1,A2,A3> and subview arguments of types Args... .
///
/// \c dst_traits is the \c traits_type reported by the experimental
/// ViewMapping for ViewTraits<D,A1,A2,A3> and Args...; \c type is the
/// DualView assembled from that traits type's data_type, array_layout,
/// device_type and memory_traits.
template< class D, class A1, class A2, class A3, class ... Args >
|
||||
struct DualViewSubview {
|
||||
|
||||
// Traits type reported by ViewMapping for the source traits and Args...
typedef typename Kokkos::Experimental::Impl::ViewMapping
|
||||
< void
|
||||
, Kokkos::ViewTraits< D, A1, A2, A3 >
|
||||
, Args ...
|
||||
>::traits_type dst_traits ;
|
||||
|
||||
// DualView built from the four components of dst_traits
typedef Kokkos::DualView
|
||||
< typename dst_traits::data_type
|
||||
, typename dst_traits::array_layout
|
||||
, typename dst_traits::device_type
|
||||
, typename dst_traits::memory_traits
|
||||
> type ;
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
|
||||
|
||||
/// \brief Take a subview of a DualView.
///
/// Returns an object of type Impl::DualViewSubview<D,A1,A2,A3,Args...>::type
/// constructed from \c src and \c args.
///
/// \param src   DualView to take the subview of.
/// \param args  Subview arguments, taken by value and forwarded unchanged.
template< class D , class A1 , class A2 , class A3 , class ... Args >
|
||||
typename Impl::DualViewSubview<D,A1,A2,A3,Args...>::type
|
||||
subview( const DualView<D,A1,A2,A3> & src , Args ... args )
|
||||
{
|
||||
return typename
|
||||
Impl::DualViewSubview<D,A1,A2,A3,Args...>::type( src , args ... );
|
||||
}
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#else
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// Partial specializations of Kokkos::subview() for DualView objects.
|
||||
//
|
||||
@ -839,6 +921,15 @@ subview( const DualView<D,A1,A2,A3> & src ,
|
||||
return sub_view;
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
//
|
||||
// Partial specialization of Kokkos::deep_copy() for DualView objects.
|
||||
//
|
||||
|
||||
@ -53,12 +53,8 @@
|
||||
*/
|
||||
namespace Kokkos {
|
||||
|
||||
template <typename Scalar, class Space = Kokkos::DefaultExecutionSpace >
|
||||
class vector : public DualView<Scalar*,LayoutLeft,Space> {
|
||||
public:
|
||||
typedef typename Space::memory_space memory_space;
|
||||
typedef typename Space::execution_space execution_space;
|
||||
typedef typename Kokkos::Device<execution_space,memory_space> device_type;
|
||||
template< class Scalar, class Arg1Type = void>
|
||||
class vector : public DualView<Scalar*,LayoutLeft,Arg1Type> {
|
||||
|
||||
typedef Scalar value_type;
|
||||
typedef Scalar* pointer;
|
||||
@ -72,7 +68,7 @@ private:
|
||||
size_t _size;
|
||||
typedef size_t size_type;
|
||||
float _extra_storage;
|
||||
typedef DualView<Scalar*,LayoutLeft,Space> DV;
|
||||
typedef DualView<Scalar*,LayoutLeft,Arg1Type> DV;
|
||||
|
||||
|
||||
public:
|
||||
@ -93,7 +89,7 @@ public:
|
||||
};
|
||||
|
||||
|
||||
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Space>("Vector",size_t(n*(1.1))) {
|
||||
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
|
||||
_size = n;
|
||||
_extra_storage = 1.1;
|
||||
DV::modified_host() = 1;
|
||||
|
||||
40
lib/kokkos/containers/unit_tests/CMakeLists.txt
Normal file
40
lib/kokkos/containers/unit_tests/CMakeLists.txt
Normal file
@ -0,0 +1,40 @@
|
||||
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
|
||||
|
||||
SET(SOURCES
|
||||
UnitTestMain.cpp
|
||||
TestCuda.cpp
|
||||
)
|
||||
|
||||
SET(LIBRARIES kokkoscore)
|
||||
|
||||
IF(Kokkos_ENABLE_Pthread)
|
||||
LIST( APPEND SOURCES
|
||||
TestThreads.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_Serial)
|
||||
LIST( APPEND SOURCES
|
||||
TestSerial.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_OpenMP)
|
||||
LIST( APPEND SOURCES
|
||||
TestOpenMP.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
UnitTest
|
||||
SOURCES ${SOURCES}
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
|
||||
@ -6,12 +6,12 @@ vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
|
||||
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = nvcc_wrapper
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
@ -56,7 +56,7 @@ KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
|
||||
|
||||
KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Threads
|
||||
|
||||
|
||||
KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_OpenMP
|
||||
|
||||
@ -74,11 +74,11 @@ test-openmp: KokkosContainers_UnitTest_OpenMP
|
||||
|
||||
test-serial: KokkosContainers_UnitTest_Serial
|
||||
./KokkosContainers_UnitTest_Serial
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
||||
|
||||
clean: kokkos-clean
|
||||
rm -f *.o $(TARGETS)
|
||||
|
||||
|
||||
@ -1,12 +1,12 @@
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -35,7 +35,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
|
||||
@ -43,7 +43,7 @@
|
||||
#ifndef KOKKOS_TEST_COMPLEX_HPP
|
||||
#define KOKKOS_TEST_COMPLEX_HPP
|
||||
|
||||
//#include <Kokkos_Complex.hpp>
|
||||
#include <Kokkos_Complex.hpp>
|
||||
#include <gtest/gtest.h>
|
||||
#include <iostream>
|
||||
|
||||
@ -124,14 +124,13 @@ namespace Impl {
|
||||
|
||||
complex_type z1 (1.0, -1.0);
|
||||
complex_type z2 (-1.0, 1.0);
|
||||
complex_type z3 = z1 - z2;
|
||||
ASSERT_TRUE( z3 == complex_type (2.0, -2.0) );
|
||||
complex_type z3 = z1 * z2;
|
||||
ASSERT_TRUE( z3 == complex_type (0.0, 2.0) );
|
||||
|
||||
// Test unary minus.
|
||||
complex_type z4 (3.0, -4.0);
|
||||
ASSERT_TRUE( z4 == complex_type (3.0, -4.0) );
|
||||
ASSERT_TRUE( -z4 == complex_type (-3.0, 4.0) );
|
||||
ASSERT_TRUE( z4 == -complex_type (-3.0, 4.0) );
|
||||
// Make sure that std::complex * Kokkos::complex works too.
|
||||
std::complex<RealType> z4 (-1.0, 1.0);
|
||||
complex_type z5 = z4 * z1;
|
||||
ASSERT_TRUE( z5 == complex_type (0.0, 2.0) );
|
||||
}
|
||||
|
||||
template <typename RealType>
|
||||
@ -208,7 +207,7 @@ namespace Impl {
|
||||
|
||||
typedef Kokkos::View<const Kokkos::complex<RealType>*, Device> view_type;
|
||||
typedef typename view_type::size_type size_type;
|
||||
typedef Kokkos::complex<RealType> value_type;
|
||||
typedef Kokkos::complex<RealType> value_type;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator () (const size_type i, Kokkos::complex<RealType>& sum) const {
|
||||
|
||||
11
lib/kokkos/core/CMakeLists.txt
Normal file
11
lib/kokkos/core/CMakeLists.txt
Normal file
@ -0,0 +1,11 @@
|
||||
|
||||
|
||||
TRIBITS_SUBPACKAGE(Core)
|
||||
|
||||
ADD_SUBDIRECTORY(src)
|
||||
|
||||
TRIBITS_ADD_TEST_DIRECTORIES(unit_test)
|
||||
TRIBITS_ADD_TEST_DIRECTORIES(perf_test)
|
||||
|
||||
TRIBITS_SUBPACKAGE_POSTPROCESS()
|
||||
|
||||
4
lib/kokkos/core/cmake/Dependencies.cmake
Normal file
4
lib/kokkos/core/cmake/Dependencies.cmake
Normal file
@ -0,0 +1,4 @@
|
||||
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD
|
||||
TEST_OPTIONAL_TPLS CUSPARSE
|
||||
)
|
||||
50
lib/kokkos/core/cmake/KokkosCore_config.h.in
Normal file
50
lib/kokkos/core/cmake/KokkosCore_config.h.in
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef KOKKOS_CORE_CONFIG_H
|
||||
#define KOKKOS_CORE_CONFIG_H
|
||||
|
||||
/* The trivial 'src/build_common.sh' creates a config
|
||||
* that must stay in sync with this file.
|
||||
*/
|
||||
#cmakedefine KOKKOS_FOR_SIERRA
|
||||
|
||||
#if !defined( KOKKOS_FOR_SIERRA )
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_MPI
|
||||
#cmakedefine KOKKOS_HAVE_CUDA
|
||||
|
||||
// mfh 16 Sep 2014: If passed in on the command line, that overrides
|
||||
// any value of KOKKOS_USE_CUDA_UVM here. Doing this should prevent build
|
||||
// warnings like this one:
|
||||
//
|
||||
// packages/kokkos/core/src/KokkosCore_config.h:13:1: warning: "KOKKOS_USE_CUDA_UVM" redefined
|
||||
//
|
||||
// At some point, we should edit the test-build scripts in
|
||||
// Trilinos/cmake/ctest/drivers/perseus/, and take
|
||||
// -DKOKKOS_USE_CUDA_UVM from the command-line arguments there. I
|
||||
// hesitate to do that now, because I'm not sure if all the files are
|
||||
// including KokkosCore_config.h (or a header file that includes it) like
|
||||
// they should.
|
||||
|
||||
#if ! defined(KOKKOS_USE_CUDA_UVM)
|
||||
#cmakedefine KOKKOS_USE_CUDA_UVM
|
||||
#endif // ! defined(KOKKOS_USE_CUDA_UVM)
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_PTHREAD
|
||||
#cmakedefine KOKKOS_HAVE_SERIAL
|
||||
#cmakedefine KOKKOS_HAVE_QTHREAD
|
||||
#cmakedefine KOKKOS_HAVE_Winthread
|
||||
#cmakedefine KOKKOS_HAVE_OPENMP
|
||||
#cmakedefine KOKKOS_HAVE_HWLOC
|
||||
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
#cmakedefine KOKKOS_HAVE_CXX11
|
||||
#cmakedefine KOKKOS_HAVE_CUSPARSE
|
||||
#cmakedefine KOKKOS_ENABLE_PROFILING_COLLECT_KERNEL_DATA
|
||||
#cmakedefine KOKKOS_ENABLE_PROFILING_AGGREGATE_MPI
|
||||
|
||||
// Don't forbid users from defining this macro on the command line,
|
||||
// but still make sure that CMake logic can control its definition.
|
||||
#if ! defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
|
||||
#cmakedefine KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
|
||||
#endif // KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
|
||||
|
||||
#endif // KOKKOS_FOR_SIERRA
|
||||
#endif // KOKKOS_CORE_CONFIG_H
|
||||
18
lib/kokkos/core/perf_test/CMakeLists.txt
Normal file
18
lib/kokkos/core/perf_test/CMakeLists.txt
Normal file
@ -0,0 +1,18 @@
|
||||
|
||||
# Header search path: this directory's build tree first, then its source tree.
# (The variable was misspelled CMAKE_CURRENT_BINRARY_DIR, which is undefined
# and expands to an empty string, so the build tree was never added.)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
SET(SOURCES
|
||||
PerfTestMain.cpp
|
||||
PerfTestHost.cpp
|
||||
PerfTestCuda.cpp
|
||||
)
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
PerfTest
|
||||
SOURCES ${SOURCES}
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
@ -1,17 +1,17 @@
|
||||
KOKKOS_PATH = ../..
|
||||
|
||||
GTEST_PATH = ../../TPL/gtest
|
||||
GTEST_PATH = ../../tpls/gtest
|
||||
|
||||
vpath %.cpp ${KOKKOS_PATH}/core/perf_test
|
||||
|
||||
default: build_all
|
||||
echo "End Build"
|
||||
|
||||
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = nvcc_wrapper
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = $(CXX)
|
||||
LDFLAGS ?= -lpthread
|
||||
@ -47,12 +47,12 @@ test-performance: KokkosCore_PerformanceTest
|
||||
|
||||
test-atomic: KokkosCore_PerformanceTest_Atomics
|
||||
./KokkosCore_PerformanceTest_Atomics
|
||||
|
||||
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
|
||||
|
||||
clean: kokkos-clean
|
||||
rm -f *.o $(TARGETS)
|
||||
|
||||
|
||||
@ -174,7 +174,7 @@ struct TextureFetch
|
||||
TEST_F( cuda, texture_double )
|
||||
{
|
||||
printf("Random reduce of double through texture fetch\n");
|
||||
for (int i=1; i<=27; ++i) {
|
||||
for (int i=1; i<=26; ++i) {
|
||||
int size = 1<<i;
|
||||
double time = 0;
|
||||
double reduce = 0;
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -167,7 +167,7 @@ T AddLoopSerial(int loop) {
|
||||
*data+=(T)1;
|
||||
|
||||
T val = *data;
|
||||
delete data;
|
||||
delete [] data;
|
||||
return val;
|
||||
}
|
||||
|
||||
@ -272,7 +272,7 @@ T CASLoopSerial(int loop) {
|
||||
}
|
||||
|
||||
T val = *data;
|
||||
delete data;
|
||||
delete [] data;
|
||||
return val;
|
||||
}
|
||||
|
||||
@ -373,8 +373,8 @@ T ExchLoopSerial(int loop) {
|
||||
}
|
||||
|
||||
T val = *data2 + *data;
|
||||
delete data;
|
||||
delete data2;
|
||||
delete [] data;
|
||||
delete [] data2;
|
||||
return val;
|
||||
}
|
||||
|
||||
|
||||
113
lib/kokkos/core/src/CMakeLists.txt
Normal file
113
lib/kokkos/core/src/CMakeLists.txt
Normal file
@ -0,0 +1,113 @@
|
||||
|
||||
# Option / macro pair for the Serial device (ON unless overridden).
TRIBITS_ADD_OPTION_AND_DEFINE(
  Kokkos_ENABLE_Serial
  KOKKOS_HAVE_SERIAL
  "Whether to enable the Kokkos::Serial device. This device executes \"parallel\" kernels sequentially on a single CPU thread. It is enabled by default. If you disable this device, please enable at least one other CPU device, such as Kokkos::OpenMP or Kokkos::Threads."
  ON
  )

# Both switches must already be defined before the default below is derived.
ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)

# Default for Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA, i.e. whether lambdas may
# be passed directly to an outer parallel_for / parallel_reduce /
# parallel_scan.  The default is ON only when C++11 is enabled and CUDA is
# not: lambda dispatch does not currently work with public releases of CUDA.
# If that changes, make the default ON whenever both CUDA and C++11 are ON.
SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
IF (${PROJECT_NAME}_ENABLE_CXX11 AND NOT ${PACKAGE_NAME}_ENABLE_CUDA)
  SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT ON)
ENDIF ()

TRIBITS_ADD_OPTION_AND_DEFINE(
  Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA
  KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
  "Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan). This requires C++11. It also does not currently work with public releases of CUDA. As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled. If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined. For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
  ${Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT}
  )

# Generate the package configuration header into the build tree.
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)

INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})

#-----------------------------------------------------------------------------

# Root directory under which the per-backend private headers are installed.
SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})

#-----------------------------------------------------------------------------

SET(HEADERS_PUBLIC "")
SET(HEADERS_PRIVATE "")
SET(SOURCES "")

# Public headers: the top-level Kokkos*.hpp files plus the generated config header.
FILE(GLOB HEADERS_PUBLIC Kokkos*.hpp)
LIST( APPEND HEADERS_PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h )

#-----------------------------------------------------------------------------

# Each implementation subdirectory is handled identically: glob its headers
# and sources into HEADERS_<UPPERCASE NAME> / SOURCES_<UPPERCASE NAME>,
# append them to the private-header and source lists, and install the
# headers under a matching subdirectory of TRILINOS_INCDIR.
FOREACH(KOKKOS_CORE_SUBDIR impl Threads OpenMP Cuda Qthread)
  STRING(TOUPPER ${KOKKOS_CORE_SUBDIR} KOKKOS_CORE_SUBDIR_UC)

  FILE(GLOB HEADERS_${KOKKOS_CORE_SUBDIR_UC} ${KOKKOS_CORE_SUBDIR}/*.hpp)
  FILE(GLOB SOURCES_${KOKKOS_CORE_SUBDIR_UC} ${KOKKOS_CORE_SUBDIR}/*.cpp)

  LIST(APPEND HEADERS_PRIVATE ${HEADERS_${KOKKOS_CORE_SUBDIR_UC}} )
  LIST(APPEND SOURCES ${SOURCES_${KOKKOS_CORE_SUBDIR_UC}} )

  INSTALL(FILES ${HEADERS_${KOKKOS_CORE_SUBDIR_UC}} DESTINATION ${TRILINOS_INCDIR}/${KOKKOS_CORE_SUBDIR}/)
ENDFOREACH()

#-----------------------------------------------------------------------------

# The core library: public headers are installed by TriBITS, private headers
# are listed as NOINSTALLHEADERS because they are installed explicitly above.
TRIBITS_ADD_LIBRARY(
  kokkoscore
  HEADERS ${HEADERS_PUBLIC}
  NOINSTALLHEADERS ${HEADERS_PRIVATE}
  SOURCES ${SOURCES}
  DEPLIBS
  )
|
||||
|
||||
|
||||
@ -54,7 +54,59 @@ namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct ViewOperatorBoundsErrorAbort< Kokkos::CudaSpace > {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void apply( const size_t rank
|
||||
, const size_t n0 , const size_t n1
|
||||
, const size_t n2 , const size_t n3
|
||||
, const size_t n4 , const size_t n5
|
||||
, const size_t n6 , const size_t n7
|
||||
, const size_t i0 , const size_t i1
|
||||
, const size_t i2 , const size_t i3
|
||||
, const size_t i4 , const size_t i5
|
||||
, const size_t i6 , const size_t i7 )
|
||||
{
|
||||
const int r =
|
||||
( n0 <= i0 ? 0 :
|
||||
( n1 <= i1 ? 1 :
|
||||
( n2 <= i2 ? 2 :
|
||||
( n3 <= i3 ? 3 :
|
||||
( n4 <= i4 ? 4 :
|
||||
( n5 <= i5 ? 5 :
|
||||
( n6 <= i6 ? 6 : 7 )))))));
|
||||
const size_t n =
|
||||
( n0 <= i0 ? n0 :
|
||||
( n1 <= i1 ? n1 :
|
||||
( n2 <= i2 ? n2 :
|
||||
( n3 <= i3 ? n3 :
|
||||
( n4 <= i4 ? n4 :
|
||||
( n5 <= i5 ? n5 :
|
||||
( n6 <= i6 ? n6 : n7 )))))));
|
||||
const size_t i =
|
||||
( n0 <= i0 ? i0 :
|
||||
( n1 <= i1 ? i1 :
|
||||
( n2 <= i2 ? i2 :
|
||||
( n3 <= i3 ? i3 :
|
||||
( n4 <= i4 ? i4 :
|
||||
( n5 <= i5 ? i5 :
|
||||
( n6 <= i6 ? i6 : i7 )))))));
|
||||
printf("Cuda view array bounds error index %d : FAILED %lu < %lu\n" , r , i , n );
|
||||
Kokkos::Impl::cuda_abort("Cuda view array bounds error");
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
|
||||
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
|
||||
// Any other scalar type falls back to either normal reads out of global memory,
|
||||
@ -130,7 +182,6 @@ struct CudaTextureFetch {
|
||||
CudaTextureFetch( const ValueType * const arg_ptr
|
||||
, Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
|
||||
)
|
||||
// 'attach_texture_object' returns 0 when __CUDA_ARCH__ < 300
|
||||
: m_obj( record.template attach_texture_object< AliasType >() )
|
||||
, m_ptr( arg_ptr )
|
||||
, m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
|
||||
|
||||
@ -208,9 +208,9 @@ struct CudaParallelLaunch< DriverType , true > {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
|
||||
}
|
||||
else if ( shmem ) {
|
||||
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared ) );
|
||||
} else {
|
||||
cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 ) );
|
||||
}
|
||||
|
||||
// Copy functor to constant memory on the device
|
||||
@ -246,9 +246,9 @@ struct CudaParallelLaunch< DriverType , false > {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
|
||||
}
|
||||
else if ( shmem ) {
|
||||
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared ) );
|
||||
} else {
|
||||
cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 );
|
||||
CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 ) );
|
||||
}
|
||||
|
||||
int* lock_array_ptr = lock_array_cuda_space_ptr();
|
||||
|
||||
@ -45,6 +45,7 @@
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <algorithm>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
@ -106,6 +107,8 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
namespace {
|
||||
|
||||
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
|
||||
@ -164,6 +167,8 @@ void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
|
||||
texture_object_attach_impl( tracker, type_size, desc );
|
||||
}
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
void CudaSpace::access_error()
|
||||
{
|
||||
const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
|
||||
@ -178,6 +183,8 @@ void CudaSpace::access_error( const void * const )
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
|
||||
{
|
||||
return Impl::AllocationTracker( allocator(), size, label);
|
||||
@ -191,6 +198,8 @@ void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & track
|
||||
texture_object_attach_impl( tracker, type_size, desc );
|
||||
}
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
bool CudaUVMSpace::available()
|
||||
{
|
||||
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
|
||||
@ -203,11 +212,15 @@ bool CudaUVMSpace::available()
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
|
||||
{
|
||||
return Impl::AllocationTracker( allocator(), size, label);
|
||||
}
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
@ -301,8 +314,18 @@ attach_texture_object( const unsigned sizeof_alias
|
||||
, void * const alloc_ptr
|
||||
, size_t const alloc_size )
|
||||
{
|
||||
// Only valid for 300 <= __CUDA_ARCH__
|
||||
// otherwise return zero.
|
||||
enum { TEXTURE_BOUND_1D = 1u << 27 };
|
||||
|
||||
if ( ( alloc_ptr == 0 ) || ( sizeof_alias * TEXTURE_BOUND_1D <= alloc_size ) ) {
|
||||
std::ostringstream msg ;
|
||||
msg << "Kokkos::CudaSpace ERROR: Cannot attach texture object to"
|
||||
<< " alloc_ptr(" << alloc_ptr << ")"
|
||||
<< " alloc_size(" << alloc_size << ")"
|
||||
<< " max_size(" << ( sizeof_alias * TEXTURE_BOUND_1D ) << ")" ;
|
||||
std::cerr << msg.str() << std::endl ;
|
||||
std::cerr.flush();
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
|
||||
::cudaTextureObject_t tex_obj ;
|
||||
|
||||
@ -505,6 +528,133 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
|
||||
);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/// Allocate arg_alloc_size bytes through a new record in arg_space and
/// return the record's data pointer with the record's count incremented.
/// A zero-size request allocates nothing and returns a null pointer.
void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
allocate_tracked( const Kokkos::CudaSpace & arg_space
                , const std::string & arg_alloc_label
                , const size_t arg_alloc_size )
{
  void * data_ptr = (void *) 0 ;

  if ( arg_alloc_size ) {
    SharedAllocationRecord * const record =
      allocate( arg_space , arg_alloc_label , arg_alloc_size );

    RecordBase::increment( record );

    data_ptr = record->data();
  }

  return data_ptr ;
}
|
||||
|
||||
/// Decrement the count of the record that owns arg_alloc_ptr.
/// A null pointer is ignored.
void SharedAllocationRecord< Kokkos::CudaSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
  if ( arg_alloc_ptr == 0 ) { return ; }

  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );

  RecordBase::decrement( record );
}
|
||||
|
||||
/// Allocate a new record of arg_alloc_size bytes with the space and label of
/// the record owning arg_alloc_ptr, copy min(old size, new size) bytes into
/// it, increment the new record, decrement the old one, and return the new
/// record's data pointer.
void * SharedAllocationRecord< Kokkos::CudaSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
                  , const size_t arg_alloc_size )
{
  SharedAllocationRecord * const previous = get_record( arg_alloc_ptr );

  SharedAllocationRecord * const replacement =
    allocate( previous->m_space , previous->get_label() , arg_alloc_size );

  // Constructing the DeepCopy temporary performs the copy.
  Kokkos::Impl::DeepCopy<CudaSpace,CudaSpace>(
      replacement->data()
    , previous->data()
    , std::min( previous->size() , replacement->size() ) );

  // Take the new reference before releasing the old one.
  RecordBase::increment( replacement );
  RecordBase::decrement( previous );

  return replacement->data();
}
|
||||
|
||||
/// Allocate arg_alloc_size bytes through a new record in arg_space and
/// return the record's data pointer with the record's count incremented.
/// A zero-size request allocates nothing and returns a null pointer.
void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
                , const std::string & arg_alloc_label
                , const size_t arg_alloc_size )
{
  void * data_ptr = (void *) 0 ;

  if ( arg_alloc_size ) {
    SharedAllocationRecord * const record =
      allocate( arg_space , arg_alloc_label , arg_alloc_size );

    RecordBase::increment( record );

    data_ptr = record->data();
  }

  return data_ptr ;
}
|
||||
|
||||
/// Decrement the count of the record that owns arg_alloc_ptr.
/// A null pointer is ignored.
void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
  if ( arg_alloc_ptr == 0 ) { return ; }

  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );

  RecordBase::decrement( record );
}
|
||||
|
||||
/// Allocate a new record of arg_alloc_size bytes with the space and label of
/// the record owning arg_alloc_ptr, copy min(old size, new size) bytes into
/// it, increment the new record, decrement the old one, and return the new
/// record's data pointer.
void * SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
                  , const size_t arg_alloc_size )
{
  SharedAllocationRecord * const previous = get_record( arg_alloc_ptr );

  SharedAllocationRecord * const replacement =
    allocate( previous->m_space , previous->get_label() , arg_alloc_size );

  // Constructing the DeepCopy temporary performs the copy.
  Kokkos::Impl::DeepCopy<CudaUVMSpace,CudaUVMSpace>(
      replacement->data()
    , previous->data()
    , std::min( previous->size() , replacement->size() ) );

  // Take the new reference before releasing the old one.
  RecordBase::increment( replacement );
  RecordBase::decrement( previous );

  return replacement->data();
}
|
||||
|
||||
/// Allocate arg_alloc_size bytes through a new record in arg_space and
/// return the record's data pointer with the record's count incremented.
/// A zero-size request allocates nothing and returns a null pointer.
void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
                , const std::string & arg_alloc_label
                , const size_t arg_alloc_size )
{
  void * data_ptr = (void *) 0 ;

  if ( arg_alloc_size ) {
    SharedAllocationRecord * const record =
      allocate( arg_space , arg_alloc_label , arg_alloc_size );

    RecordBase::increment( record );

    data_ptr = record->data();
  }

  return data_ptr ;
}
|
||||
|
||||
/// Decrement the count of the record that owns arg_alloc_ptr.
/// A null pointer is ignored.
void SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
deallocate_tracked( void * const arg_alloc_ptr )
{
  if ( arg_alloc_ptr == 0 ) { return ; }

  SharedAllocationRecord * const record = get_record( arg_alloc_ptr );

  RecordBase::decrement( record );
}
|
||||
|
||||
/// Allocate a new record of arg_alloc_size bytes with the space and label of
/// the record owning arg_alloc_ptr, copy min(old size, new size) bytes into
/// it, increment the new record, decrement the old one, and return the new
/// record's data pointer.
void * SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
reallocate_tracked( void * const arg_alloc_ptr
                  , const size_t arg_alloc_size )
{
  SharedAllocationRecord * const previous = get_record( arg_alloc_ptr );

  SharedAllocationRecord * const replacement =
    allocate( previous->m_space , previous->get_label() , arg_alloc_size );

  // Constructing the DeepCopy temporary performs the copy.
  Kokkos::Impl::DeepCopy<CudaHostPinnedSpace,CudaHostPinnedSpace>(
      replacement->data()
    , previous->data()
    , std::min( previous->size() , replacement->size() ) );

  // Take the new reference before releasing the old one.
  RecordBase::increment( replacement );
  RecordBase::decrement( previous );

  return replacement->data();
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
SharedAllocationRecord< Kokkos::CudaSpace , void > *
|
||||
SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
|
||||
{
|
||||
@ -514,15 +664,17 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr
|
||||
|
||||
#if 0
|
||||
// Copy the header from the allocation
|
||||
SharedAllocationHeader head ;
|
||||
Header head ;
|
||||
|
||||
SharedAllocationHeader const * const head_cuda = Header::get_header( alloc_ptr );
|
||||
Header const * const head_cuda = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ;
|
||||
|
||||
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
|
||||
if ( alloc_ptr ) {
|
||||
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
|
||||
}
|
||||
|
||||
RecordCuda * const record = static_cast< RecordCuda * >( head.m_record );
|
||||
RecordCuda * const record = alloc_ptr ? static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ;
|
||||
|
||||
if ( record->m_alloc_ptr != head_cuda ) {
|
||||
if ( ! alloc_ptr || record->m_alloc_ptr != head_cuda ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
@ -548,9 +700,9 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_
|
||||
using Header = SharedAllocationHeader ;
|
||||
using RecordCuda = SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
|
||||
|
||||
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
|
||||
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
|
||||
|
||||
if ( h->m_record->m_alloc_ptr != h ) {
|
||||
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
@ -563,9 +715,9 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void *
|
||||
using Header = SharedAllocationHeader ;
|
||||
using RecordCuda = SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > ;
|
||||
|
||||
Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
|
||||
Header * const h = alloc_ptr ? reinterpret_cast< Header * >( alloc_ptr ) - 1 : (Header *) 0 ;
|
||||
|
||||
if ( h->m_record->m_alloc_ptr != h ) {
|
||||
if ( ! alloc_ptr || h->m_record->m_alloc_ptr != h ) {
|
||||
Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
|
||||
}
|
||||
|
||||
@ -592,14 +744,25 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
|
||||
head.m_label[0] = 0 ;
|
||||
}
|
||||
|
||||
snprintf( buffer , 256 , "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
|
||||
, reinterpret_cast<unsigned long>( r )
|
||||
, reinterpret_cast<unsigned long>( r->m_prev )
|
||||
, reinterpret_cast<unsigned long>( r->m_next )
|
||||
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
|
||||
//Formatting dependent on sizeof(uintptr_t)
|
||||
const char * format_string;
|
||||
|
||||
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
|
||||
format_string = "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
|
||||
}
|
||||
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
|
||||
format_string = "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
|
||||
}
|
||||
|
||||
snprintf( buffer , 256
|
||||
, format_string
|
||||
, reinterpret_cast<uintptr_t>( r )
|
||||
, reinterpret_cast<uintptr_t>( r->m_prev )
|
||||
, reinterpret_cast<uintptr_t>( r->m_next )
|
||||
, reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
|
||||
, r->m_alloc_size
|
||||
, r->m_count
|
||||
, reinterpret_cast<unsigned long>( r->m_dealloc )
|
||||
, reinterpret_cast<uintptr_t>( r->m_dealloc )
|
||||
, head.m_label
|
||||
);
|
||||
std::cout << buffer ;
|
||||
@ -612,8 +775,19 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail
|
||||
|
||||
Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
|
||||
|
||||
snprintf( buffer , 256 , "Cuda [ 0x%.12lx + %ld ] %s\n"
|
||||
, reinterpret_cast< unsigned long >( r->data() )
|
||||
//Formatting dependent on sizeof(uintptr_t)
|
||||
const char * format_string;
|
||||
|
||||
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
|
||||
format_string = "Cuda [ 0x%.12lx + %ld ] %s\n";
|
||||
}
|
||||
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
|
||||
format_string = "Cuda [ 0x%.12llx + %ld ] %s\n";
|
||||
}
|
||||
|
||||
snprintf( buffer , 256
|
||||
, format_string
|
||||
, reinterpret_cast< uintptr_t >( r->data() )
|
||||
, r->size()
|
||||
, head.m_label
|
||||
);
|
||||
|
||||
@ -71,7 +71,7 @@ shared_allocation_record( Kokkos::CudaSpace const & arg_space
|
||||
|
||||
DestructFunctor * const functor =
|
||||
reinterpret_cast< DestructFunctor * >(
|
||||
reinterpret_cast< unsigned long >( record ) + sizeof(SharedAllocationRecord) );
|
||||
reinterpret_cast< uintptr_t >( record ) + sizeof(SharedAllocationRecord) );
|
||||
|
||||
new( functor ) DestructFunctor( arg_destruct );
|
||||
|
||||
|
||||
@ -43,6 +43,8 @@
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
@ -56,6 +58,7 @@ namespace Kokkos { namespace Impl {
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
TextureAttribute::TextureAttribute( void * const alloc_ptr
|
||||
, size_t alloc_size
|
||||
, cudaChannelFormatDesc const & desc
|
||||
@ -190,3 +193,6 @@ void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif //KOKKOS_HAVE_CUDA
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
|
||||
@ -46,6 +46,8 @@
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
|
||||
@ -85,7 +87,6 @@ struct TextureAttribute : public AllocatorAttributeBase
|
||||
~TextureAttribute();
|
||||
};
|
||||
|
||||
|
||||
/// class CudaUnmanagedAllocator
|
||||
/// does nothing when deallocate(ptr,size) is called
|
||||
struct CudaUnmanagedAllocator
|
||||
@ -184,4 +185,6 @@ public:
|
||||
|
||||
#endif //KOKKOS_HAVE_CUDA
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
|
||||
|
||||
@ -222,10 +222,14 @@ private:
|
||||
CudaInternal( const CudaInternal & );
|
||||
CudaInternal & operator = ( const CudaInternal & );
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
AllocationTracker m_scratchFlagsTracker;
|
||||
AllocationTracker m_scratchSpaceTracker;
|
||||
AllocationTracker m_scratchUnifiedTracker;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
public:
|
||||
|
||||
@ -482,6 +486,32 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_CUDA_USE_UVM
|
||||
if(!cuda_launch_blocking()) {
|
||||
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
|
||||
std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl;
|
||||
std::cout << " The code must call Cuda::fence() after each kernel" << std::endl;
|
||||
std::cout << " or will likely crash when accessing data on the host." << std::endl;
|
||||
}
|
||||
|
||||
const char * env_force_device_alloc = getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC");
|
||||
bool force_device_alloc;
|
||||
if (env_force_device_alloc == 0) force_device_alloc=false;
|
||||
else force_device_alloc=atoi(env_force_device_alloc)!=0;
|
||||
|
||||
const char * env_visible_devices = getenv("CUDA_VISIBLE_DEVICES");
|
||||
bool visible_devices_one=true;
|
||||
if (env_visible_devices == 0) visible_devices_one=false;
|
||||
|
||||
if(!visible_devices_one && !force_device_alloc) {
|
||||
std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl;
|
||||
std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl;
|
||||
std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl;
|
||||
std::cout << " This could on multi GPU systems lead to severe performance" << std::endl;
|
||||
std::cout << " penalties." << std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Init the array used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_cuda_space();
|
||||
|
||||
@ -501,9 +531,27 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
|
||||
|
||||
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
|
||||
|
||||
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaSpace()
|
||||
, "InternalScratchFlags"
|
||||
, ( sizeof( ScratchGrain ) * m_scratchFlagsCount ) );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
|
||||
}
|
||||
|
||||
@ -517,9 +565,26 @@ CudaInternal::scratch_space( const Cuda::size_type size )
|
||||
|
||||
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
|
||||
|
||||
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaSpace()
|
||||
, "InternalScratchSpace"
|
||||
, ( sizeof( ScratchGrain ) * m_scratchSpaceCount ) );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return m_scratchSpace ;
|
||||
@ -533,8 +598,26 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
|
||||
|
||||
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
|
||||
|
||||
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
|
||||
, "InternalScratchUnified"
|
||||
, ( sizeof( ScratchGrain ) * m_scratchUnifiedCount ) );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
return m_scratchUnified ;
|
||||
@ -555,10 +638,23 @@ void CudaInternal::finalize()
|
||||
::free( m_stream );
|
||||
}
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
m_scratchSpaceTracker.clear();
|
||||
m_scratchFlagsTracker.clear();
|
||||
m_scratchUnifiedTracker.clear();
|
||||
|
||||
#else
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
|
||||
|
||||
RecordCuda::decrement( RecordCuda::get_record( m_scratchFlags ) );
|
||||
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
|
||||
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
|
||||
|
||||
#endif
|
||||
|
||||
m_cudaDev = -1 ;
|
||||
m_maxWarpCount = 0 ;
|
||||
m_maxBlock = 0 ;
|
||||
|
||||
@ -43,7 +43,7 @@
|
||||
|
||||
#ifndef KOKKOS_CUDA_INTERNAL_HPP
|
||||
#define KOKKOS_CUDA_INTERNAL_HPP
|
||||
|
||||
#include<iostream>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
/* only compile this file if CUDA is enabled for Kokkos */
|
||||
@ -53,18 +53,21 @@
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
template<class DriverType, bool Large>
|
||||
struct CudaGetMaxBlockSize;
|
||||
|
||||
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
|
||||
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra);
|
||||
}
|
||||
|
||||
|
||||
template<class DriverType>
|
||||
int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
|
||||
#if ( CUDA_VERSION < 6050 )
|
||||
return 256;
|
||||
#else
|
||||
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
|
||||
|
||||
int numBlocks;
|
||||
if(Large) {
|
||||
struct CudaGetMaxBlockSize<DriverType,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
int numBlocks;
|
||||
int blockSize=32;
|
||||
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
|
||||
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
@ -73,7 +76,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
|
||||
|
||||
while (blockSize<1024 && numBlocks>0) {
|
||||
blockSize*=2;
|
||||
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length);
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
@ -83,9 +86,16 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
|
||||
}
|
||||
if(numBlocks>0) return blockSize;
|
||||
else return blockSize/2;
|
||||
} else {
|
||||
}
|
||||
};
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetMaxBlockSize<DriverType,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
int numBlocks;
|
||||
|
||||
int blockSize=32;
|
||||
int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
|
||||
int sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_local_memory<DriverType>,
|
||||
@ -94,7 +104,7 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
|
||||
|
||||
while (blockSize<1024 && numBlocks>0) {
|
||||
blockSize*=2;
|
||||
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
@ -105,42 +115,58 @@ int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
|
||||
if(numBlocks>0) return blockSize;
|
||||
else return blockSize/2;
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
||||
template<class DriverType, bool Large>
|
||||
struct CudaGetOptBlockSize;
|
||||
|
||||
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
|
||||
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra);
|
||||
}
|
||||
|
||||
template<class DriverType>
|
||||
int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
|
||||
#if ( CUDA_VERSION < 6050 )
|
||||
return 256;
|
||||
#else
|
||||
bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
|
||||
struct CudaGetOptBlockSize<DriverType,true> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
int blockSize=16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy=0;
|
||||
int bestBlockSize=0;
|
||||
|
||||
int blockSize=16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy=0;
|
||||
int bestBlockSize=0;
|
||||
|
||||
if(Large) {
|
||||
while(blockSize<1024) {
|
||||
blockSize*=2;
|
||||
|
||||
//calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
|
||||
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
cuda_parallel_launch_constant_memory<DriverType>,
|
||||
blockSize,
|
||||
sharedmem);
|
||||
if(maxOccupancy < numBlocks*blockSize) {
|
||||
maxOccupancy = numBlocks*blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
maxOccupancy = numBlocks*blockSize;
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return bestBlockSize;
|
||||
}
|
||||
};
|
||||
|
||||
template<class DriverType>
|
||||
struct CudaGetOptBlockSize<DriverType,false> {
|
||||
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length, const size_t shmem_extra) {
|
||||
int blockSize=16;
|
||||
int numBlocks;
|
||||
int sharedmem;
|
||||
int maxOccupancy=0;
|
||||
int bestBlockSize=0;
|
||||
|
||||
while(blockSize<1024) {
|
||||
blockSize*=2;
|
||||
sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize );
|
||||
sharedmem = shmem_extra + FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
|
||||
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&numBlocks,
|
||||
@ -153,10 +179,9 @@ int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
|
||||
bestBlockSize = blockSize;
|
||||
}
|
||||
}
|
||||
return bestBlockSize;
|
||||
}
|
||||
return bestBlockSize;
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -117,7 +117,7 @@ inline void cuda_inter_warp_reduction( ValueType& value,
|
||||
|
||||
|
||||
value = result[0];
|
||||
for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++)
|
||||
for(int i = 1; (i*step<max_active_thread) && i<STEP_WIDTH; i++)
|
||||
join(value,result[i]);
|
||||
}
|
||||
|
||||
@ -345,8 +345,11 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
// '__ffs' = position of the least significant bit set to 1.
|
||||
// 'blockDim.y' is guaranteed to be a power of two so this
|
||||
// is the integral shift value that can replace an integral divide.
|
||||
const unsigned BlockSizeShift = __ffs( blockDim.y ) - 1 ;
|
||||
const unsigned BlockSizeMask = blockDim.y - 1 ;
|
||||
const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y );
|
||||
|
||||
// Must have power of two thread count
|
||||
if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
|
||||
|
||||
@ -53,6 +53,7 @@
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <Kokkos_CudaSpace.hpp>
|
||||
#include <impl/Kokkos_Shape.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
|
||||
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
|
||||
@ -89,6 +90,8 @@ struct AssertShapeBoundsAbort< CudaSpace >
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
@ -419,6 +422,8 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -45,6 +45,7 @@
|
||||
#define KOKKOS_ARRAY
|
||||
|
||||
#include <type_traits>
|
||||
#include <algorithm>
|
||||
#include <limits>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
529
lib/kokkos/core/src/Kokkos_Complex.hpp
Normal file
529
lib/kokkos/core/src/Kokkos_Complex.hpp
Normal file
@ -0,0 +1,529 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
#ifndef KOKKOS_COMPLEX_HPP
|
||||
#define KOKKOS_COMPLEX_HPP
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
#include <cmath>
#include <complex>
#include <iostream>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/// \class complex
|
||||
/// \brief Partial reimplementation of std::complex that works as the
|
||||
/// result of a Kokkos::parallel_reduce.
|
||||
/// \tparam RealType The type of the real and imaginary parts of the
|
||||
/// complex number. As with std::complex, this is only defined for
|
||||
/// \c float, \c double, and <tt>long double</tt>. The latter is
|
||||
/// currently forbidden in CUDA device kernels.
|
||||
template<class RealType>
|
||||
class complex {
|
||||
private:
|
||||
RealType re_, im_;
|
||||
|
||||
public:
|
||||
//! The type of the real or imaginary parts of this complex number.
|
||||
typedef RealType value_type;
|
||||
|
||||
//! Default constructor (initializes both real and imaginary parts to zero).
|
||||
KOKKOS_INLINE_FUNCTION complex () :
|
||||
re_ (0.0), im_ (0.0)
|
||||
{}
|
||||
|
||||
//! Copy constructor.
|
||||
KOKKOS_INLINE_FUNCTION complex (const complex<RealType>& src) :
|
||||
re_ (src.re_), im_ (src.im_)
|
||||
{}
|
||||
|
||||
//! Copy constructor from volatile.
|
||||
KOKKOS_INLINE_FUNCTION complex (const volatile complex<RealType>& src) :
|
||||
re_ (src.re_), im_ (src.im_)
|
||||
{}
|
||||
|
||||
/// \brief Conversion constructor from std::complex.
|
||||
///
|
||||
/// This constructor cannot be called in a CUDA device function,
|
||||
/// because std::complex's methods and nonmember functions are not
|
||||
/// marked as CUDA device functions.
|
||||
template<class InputRealType>
|
||||
complex (const std::complex<InputRealType>& src) :
|
||||
re_ (std::real (src)), im_ (std::imag (src))
|
||||
{}
|
||||
|
||||
/// \brief Conversion operator to std::complex.
|
||||
///
|
||||
/// This operator cannot be called in a CUDA device function,
|
||||
/// because std::complex's methods and nonmember functions are not
|
||||
/// marked as CUDA device functions.
|
||||
operator std::complex<RealType> () const {
|
||||
return std::complex<RealType> (re_, im_);
|
||||
}
|
||||
|
||||
/// \brief Constructor that takes just the real part, and sets the
|
||||
/// imaginary part to zero.
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION complex (const InputRealType& val) :
|
||||
re_ (val), im_ (0.0)
|
||||
{}
|
||||
|
||||
//! Constructor that takes the real and imaginary parts.
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
|
||||
re_ (re), im_ (im)
|
||||
{}
|
||||
|
||||
//! Assignment operator.
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator= (const complex<InputRealType>& src) {
|
||||
re_ = src.re_;
|
||||
im_ = src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! Assignment operator.
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
|
||||
re_ = src.re_;
|
||||
im_ = src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! Assignment operator.
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
volatile complex<RealType>& operator= (const volatile complex<InputRealType>& src) volatile {
|
||||
re_ = src.re_;
|
||||
im_ = src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! Assignment operator.
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator= (const volatile complex<InputRealType>& src) {
|
||||
re_ = src.re_;
|
||||
im_ = src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! Assignment operator (from a real number).
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator= (const InputRealType& val) {
|
||||
re_ = val;
|
||||
im_ = static_cast<RealType> (0.0);
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! Assignment operator (from a real number).
|
||||
template<class InputRealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator= (const InputRealType& val) volatile {
|
||||
re_ = val;
|
||||
im_ = static_cast<RealType> (0.0);
|
||||
}
|
||||
|
||||
/// \brief Assignment operator from std::complex.
|
||||
///
|
||||
/// This constructor cannot be called in a CUDA device function,
|
||||
/// because std::complex's methods and nonmember functions are not
|
||||
/// marked as CUDA device functions.
|
||||
template<class InputRealType>
|
||||
complex<RealType>& operator= (const std::complex<InputRealType>& src) {
|
||||
re_ = std::real (src);
|
||||
im_ = std::imag (src);
|
||||
return *this;
|
||||
}
|
||||
|
||||
//! The imaginary part of this complex number.
|
||||
KOKKOS_INLINE_FUNCTION RealType& imag () {
|
||||
return im_;
|
||||
}
|
||||
|
||||
//! The real part of this complex number.
|
||||
KOKKOS_INLINE_FUNCTION RealType& real () {
|
||||
return re_;
|
||||
}
|
||||
|
||||
//! The imaginary part of this complex number.
|
||||
KOKKOS_INLINE_FUNCTION const RealType imag () const {
|
||||
return im_;
|
||||
}
|
||||
|
||||
//! The real part of this complex number.
|
||||
KOKKOS_INLINE_FUNCTION const RealType real () const {
|
||||
return re_;
|
||||
}
|
||||
|
||||
//! The imaginary part of this complex number (volatile overload).
|
||||
KOKKOS_INLINE_FUNCTION volatile RealType& imag () volatile {
|
||||
return im_;
|
||||
}
|
||||
|
||||
//! The real part of this complex number (volatile overload).
|
||||
KOKKOS_INLINE_FUNCTION volatile RealType& real () volatile {
|
||||
return re_;
|
||||
}
|
||||
|
||||
//! The imaginary part of this complex number (volatile overload).
|
||||
KOKKOS_INLINE_FUNCTION const RealType imag () const volatile {
|
||||
return im_;
|
||||
}
|
||||
|
||||
//! The real part of this complex number (volatile overload).
|
||||
KOKKOS_INLINE_FUNCTION const RealType real () const volatile {
|
||||
return re_;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator += (const complex<RealType>& src) {
|
||||
re_ += src.re_;
|
||||
im_ += src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator += (const volatile complex<RealType>& src) volatile {
|
||||
re_ += src.re_;
|
||||
im_ += src.im_;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator += (const RealType& src) {
|
||||
re_ += src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator += (const volatile RealType& src) volatile {
|
||||
re_ += src;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator -= (const complex<RealType>& src) {
|
||||
re_ -= src.re_;
|
||||
im_ -= src.im_;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator -= (const RealType& src) {
|
||||
re_ -= src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator *= (const complex<RealType>& src) {
|
||||
const RealType realPart = re_ * src.re_ - im_ * src.im_;
|
||||
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
|
||||
re_ = realPart;
|
||||
im_ = imagPart;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator *= (const volatile complex<RealType>& src) volatile {
|
||||
const RealType realPart = re_ * src.re_ - im_ * src.im_;
|
||||
const RealType imagPart = re_ * src.im_ + im_ * src.re_;
|
||||
re_ = realPart;
|
||||
im_ = imagPart;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator *= (const RealType& src) {
|
||||
re_ *= src;
|
||||
im_ *= src;
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator *= (const volatile RealType& src) volatile {
|
||||
re_ *= src;
|
||||
im_ *= src;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator /= (const complex<RealType>& y) {
|
||||
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
|
||||
// If the real part is +/-Inf and the imaginary part is -/+Inf,
|
||||
// this won't change the result.
|
||||
const RealType s = ::fabs (y.real ()) + ::fabs (y.imag ());
|
||||
|
||||
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
|
||||
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
|
||||
// because y/s is NaN.
|
||||
if (s == 0.0) {
|
||||
this->re_ /= s;
|
||||
this->im_ /= s;
|
||||
}
|
||||
else {
|
||||
const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
|
||||
const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
|
||||
const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
|
||||
y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
|
||||
*this = x_scaled * y_conj_scaled;
|
||||
*this /= y_scaled_abs;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>& operator /= (const RealType& src) {
|
||||
re_ /= src;
|
||||
im_ /= src;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
//! Binary + operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
|
||||
}
|
||||
|
||||
//! Unary + operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator + (const complex<RealType>& x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
//! Binary - operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
|
||||
}
|
||||
|
||||
//! Unary - operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator - (const complex<RealType>& x) {
|
||||
return complex<RealType> (-x.real (), -x.imag ());
|
||||
}
|
||||
|
||||
//! Binary * operator for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator * (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
|
||||
x.real () * y.imag () + x.imag () * y.real ());
|
||||
}
|
||||
|
||||
/// \brief Binary * operator for std::complex and complex.
|
||||
///
|
||||
/// This function exists because GCC 4.7.2 (and perhaps other
|
||||
/// compilers) are not able to deduce that they can multiply
|
||||
/// std::complex by Kokkos::complex, by first converting std::complex
|
||||
/// to Kokkos::complex.
|
||||
///
|
||||
/// This function cannot be called in a CUDA device function, because
|
||||
/// std::complex's methods and nonmember functions are not marked as
|
||||
/// CUDA device functions.
|
||||
template<class RealType>
|
||||
complex<RealType>
|
||||
operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
|
||||
x.real () * y.imag () + x.imag () * y.real ());
|
||||
}
|
||||
|
||||
/// \brief Binary * operator for RealType times complex.
|
||||
///
|
||||
/// This function exists because the compiler doesn't know that
|
||||
/// RealType and complex<RealType> commute with respect to operator*.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator * (const RealType& x, const complex<RealType>& y) {
|
||||
return complex<RealType> (x * y.real (), x * y.imag ());
|
||||
}
|
||||
|
||||
|
||||
//! Imaginary part of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
RealType imag (const complex<RealType>& x) {
|
||||
return x.imag ();
|
||||
}
|
||||
|
||||
//! Real part of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
RealType real (const complex<RealType>& x) {
|
||||
return x.real ();
|
||||
}
|
||||
|
||||
//! Absolute value (magnitude) of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
RealType abs (const complex<RealType>& x) {
|
||||
// FIXME (mfh 31 Oct 2014) Scale to avoid unwarranted overflow.
|
||||
return ::sqrt (real (x) * real (x) + imag (x) * imag (x));
|
||||
}
|
||||
|
||||
//! Conjugate of a complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType> conj (const complex<RealType>& x) {
|
||||
return complex<RealType> (real (x), -imag (x));
|
||||
}
|
||||
|
||||
|
||||
//! Binary operator / for complex and real numbers
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType1>
|
||||
operator / (const complex<RealType1>& x, const RealType2& y) {
|
||||
return complex<RealType1> (real (x) / y, imag (x) / y);
|
||||
}
|
||||
|
||||
//! Binary operator / for complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
complex<RealType>
|
||||
operator / (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
|
||||
// If the real part is +/-Inf and the imaginary part is -/+Inf,
|
||||
// this won't change the result.
|
||||
const RealType s = ::fabs (real (y)) + ::fabs (imag (y));
|
||||
|
||||
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
|
||||
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
|
||||
// because y/s is NaN.
|
||||
if (s == 0.0) {
|
||||
return complex<RealType> (real (x) / s, imag (x) / s);
|
||||
}
|
||||
else {
|
||||
const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
|
||||
const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
|
||||
const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
|
||||
imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
|
||||
complex<RealType> result = x_scaled * y_conj_scaled;
|
||||
result /= y_scaled_abs;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
//! Equality operator for two complex numbers.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return real (x) == real (y) && imag (x) == imag (y);
|
||||
}
|
||||
|
||||
//! Equality operator for std::complex and Kokkos::complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return std::real (x) == real (y) && std::imag (x) == imag (y);
|
||||
}
|
||||
|
||||
//! Equality operator for complex and real number.
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const complex<RealType1>& x, const RealType2& y) {
|
||||
return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
|
||||
}
|
||||
|
||||
//! Equality operator for real and complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator == (const RealType& x, const complex<RealType>& y) {
|
||||
return y == x;
|
||||
}
|
||||
|
||||
//! Inequality operator for two complex numbers.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
|
||||
return real (x) != real (y) || imag (x) != imag (y);
|
||||
}
|
||||
|
||||
//! Inequality operator for std::complex and Kokkos::complex.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
|
||||
return std::real (x) != real (y) || std::imag (x) != imag (y);
|
||||
}
|
||||
|
||||
//! Inequality operator for complex and real number.
|
||||
template<class RealType1, class RealType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const complex<RealType1>& x, const RealType2& y) {
|
||||
return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
|
||||
}
|
||||
|
||||
//! Inequality operator for real and complex number.
|
||||
template<class RealType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
bool operator != (const RealType& x, const complex<RealType>& y) {
|
||||
return y != x;
|
||||
}
|
||||
|
||||
template<class RealType>
|
||||
std::ostream& operator << (std::ostream& os, const complex<RealType>& x) {
|
||||
const std::complex<RealType> x_std (Kokkos::real (x), Kokkos::imag (x));
|
||||
os << x_std;
|
||||
return os;
|
||||
}
|
||||
|
||||
template<class RealType>
|
||||
std::ostream& operator >> (std::ostream& os, complex<RealType>& x) {
|
||||
std::complex<RealType> x_std;
|
||||
os >> x_std;
|
||||
x = x_std; // only assigns on success of above
|
||||
return os;
|
||||
}
|
||||
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // KOKKOS_COMPLEX_HPP
|
||||
@ -49,22 +49,22 @@
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
#if defined( KOKKOS_HAVE_SERIAL )
|
||||
#include <Kokkos_Serial.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_HAVE_OPENMP )
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_HAVE_SERIAL )
|
||||
#include <Kokkos_Serial.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_HAVE_PTHREAD )
|
||||
#include <Kokkos_Threads.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_HAVE_CUDA )
|
||||
#include <Kokkos_Cuda.hpp>
|
||||
#endif
|
||||
|
||||
#include <Kokkos_Pair.hpp>
|
||||
#include <Kokkos_Array.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
@ -72,10 +72,8 @@
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Kokkos_hwloc.hpp>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX11
|
||||
////#include <Kokkos_Complex.hpp>
|
||||
#include <Kokkos_Complex.hpp>
|
||||
#endif
|
||||
|
||||
|
||||
@ -107,9 +105,70 @@ void finalize_all();
|
||||
|
||||
void fence();
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
/* Allocate memory from a memory space.
|
||||
* The allocation is tracked in Kokkos memory tracking system, so
|
||||
* leaked memory can be identified.
|
||||
*/
|
||||
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
|
||||
inline
|
||||
void * kokkos_malloc( const std::string & arg_alloc_label
|
||||
, const size_t arg_alloc_size )
|
||||
{
|
||||
typedef typename Space::memory_space MemorySpace ;
|
||||
return Impl::SharedAllocationRecord< MemorySpace >::
|
||||
allocate_tracked( MemorySpace() , arg_alloc_label , arg_alloc_size );
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX11
|
||||
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
|
||||
inline
|
||||
void * kokkos_malloc( const size_t arg_alloc_size )
|
||||
{
|
||||
typedef typename Space::memory_space MemorySpace ;
|
||||
return Impl::SharedAllocationRecord< MemorySpace >::
|
||||
allocate_tracked( MemorySpace() , "no-label" , arg_alloc_size );
|
||||
}
|
||||
|
||||
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
|
||||
inline
|
||||
void kokkos_free( void * arg_alloc )
|
||||
{
|
||||
typedef typename Space::memory_space MemorySpace ;
|
||||
return Impl::SharedAllocationRecord< MemorySpace >::
|
||||
deallocate_tracked( arg_alloc );
|
||||
}
|
||||
|
||||
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
|
||||
inline
|
||||
void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
||||
{
|
||||
typedef typename Space::memory_space MemorySpace ;
|
||||
return Impl::SharedAllocationRecord< MemorySpace >::
|
||||
reallocate_tracked( arg_alloc , arg_alloc_size );
|
||||
}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
using Kokkos::Experimental::kokkos_malloc ;
|
||||
using Kokkos::Experimental::kokkos_realloc ;
|
||||
using Kokkos::Experimental::kokkos_free ;
|
||||
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
@ -161,7 +220,10 @@ void kokkos_free(const void* ptr) {
|
||||
|
||||
|
||||
template< class Arg = DefaultExecutionSpace>
|
||||
const void* kokkos_realloc(const void* old_ptr, size_t size) {
|
||||
void* kokkos_realloc(const void* old_ptr, size_t size) {
|
||||
if(old_ptr == NULL)
|
||||
return kokkos_malloc<Arg>(size);
|
||||
|
||||
typedef typename Arg::memory_space MemorySpace;
|
||||
typedef typename MemorySpace::allocator allocator;
|
||||
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
|
||||
@ -172,64 +234,11 @@ const void* kokkos_realloc(const void* old_ptr, size_t size) {
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
|
||||
inline
|
||||
void * kokkos_malloc( const size_t arg_alloc_size )
|
||||
{
|
||||
typedef typename Space::memory_space MemorySpace ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
|
||||
|
||||
RecordHost * const r = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
|
||||
|
||||
RecordBase::increment( r );
|
||||
|
||||
return r->data();
|
||||
}
|
||||
|
||||
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
|
||||
inline
|
||||
void kokkos_free( void * arg_alloc )
|
||||
{
|
||||
typedef typename Space::memory_space MemorySpace ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
|
||||
|
||||
RecordHost * const r = RecordHost::get_record( arg_alloc );
|
||||
|
||||
RecordBase::decrement( r );
|
||||
}
|
||||
|
||||
template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
|
||||
inline
|
||||
void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
||||
{
|
||||
typedef typename Space::memory_space MemorySpace ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void > RecordBase ;
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void > RecordHost ;
|
||||
|
||||
RecordHost * const r_old = RecordHost::get_record( arg_alloc );
|
||||
RecordHost * const r_new = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
|
||||
|
||||
Kokkos::Impl::DeepCopy<MemorySpace,MemorySpace>( r_new->data() , r_old->data()
|
||||
, std::min( r_old->size() , r_new->size() ) );
|
||||
|
||||
RecordBase::increment( r_new );
|
||||
RecordBase::decrement( r_old );
|
||||
|
||||
return r_new->data();
|
||||
}
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -50,6 +50,22 @@
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
struct AUTO_t {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr const AUTO_t & operator()() const { return *this ; }
|
||||
};
|
||||
|
||||
namespace {
|
||||
/**\brief Token to indicate that a parameter's value is to be automatically selected */
|
||||
constexpr AUTO_t AUTO = Kokkos::AUTO_t();
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
// Forward declarations for class inter-relationships
|
||||
@ -58,6 +74,12 @@ namespace Kokkos {
|
||||
|
||||
class HostSpace ; ///< Memory space for main process and CPU execution spaces
|
||||
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
namespace Experimental {
|
||||
class HBWSpace ; ///< Memory space for hbw_malloc from memkind (e.g. for KNL processor)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_HAVE_SERIAL )
|
||||
class Serial ; ///< Execution space main process on CPU
|
||||
#endif // defined( KOKKOS_HAVE_SERIAL )
|
||||
@ -162,9 +184,15 @@ struct VerifyExecutionCanAccessMemorySpace< Space , Space >
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
|
||||
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
void fence();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
|
||||
|
||||
|
||||
@ -75,6 +75,10 @@ public:
|
||||
|
||||
typedef unsigned int size_type ;
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
typedef Impl::CudaMallocAllocator allocator;
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
@ -96,6 +100,8 @@ public:
|
||||
);
|
||||
#endif
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
CudaSpace();
|
||||
@ -103,10 +109,10 @@ public:
|
||||
CudaSpace & operator = ( const CudaSpace & rhs ) = default ;
|
||||
~CudaSpace() = default ;
|
||||
|
||||
/**\brief Allocate memory in the cuda space */
|
||||
/**\brief Allocate untracked memory in the cuda space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Deallocate memory in the cuda space */
|
||||
/**\brief Deallocate untracked memory in the cuda space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
@ -162,6 +168,10 @@ public:
|
||||
/** \brief If UVM capability is available */
|
||||
static bool available();
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
typedef Impl::CudaUVMAllocator allocator;
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
@ -182,6 +192,9 @@ public:
|
||||
, ::cudaChannelFormatDesc const & desc
|
||||
);
|
||||
#endif
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
CudaUVMSpace();
|
||||
@ -189,10 +202,10 @@ public:
|
||||
CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ;
|
||||
~CudaUVMSpace() = default ;
|
||||
|
||||
/**\brief Allocate memory in the cuda space */
|
||||
/**\brief Allocate untracked memory in the cuda space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Deallocate memory in the cuda space */
|
||||
/**\brief Deallocate untracked memory in the cuda space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
@ -223,6 +236,9 @@ public:
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
typedef unsigned int size_type ;
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
typedef Impl::CudaHostAllocator allocator ;
|
||||
|
||||
@ -234,6 +250,8 @@ public:
|
||||
*/
|
||||
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
CudaHostPinnedSpace();
|
||||
@ -241,10 +259,10 @@ public:
|
||||
CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ;
|
||||
~CudaHostPinnedSpace() = default ;
|
||||
|
||||
/**\brief Allocate memory in the cuda space */
|
||||
/**\brief Allocate untracked memory in the space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Deallocate memory in the cuda space */
|
||||
/**\brief Deallocate untracked memory in the space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
@ -631,8 +649,24 @@ public:
|
||||
|
||||
static SharedAllocationRecord * allocate( const Kokkos::CudaSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
);
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::CudaSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Reallocate tracked memory in the space */
|
||||
static
|
||||
void * reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Deallocate tracked memory in the space */
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
template< typename AliasType >
|
||||
inline
|
||||
@ -660,8 +694,6 @@ public:
|
||||
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
|
||||
}
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false );
|
||||
};
|
||||
|
||||
@ -704,6 +736,24 @@ public:
|
||||
, const size_t arg_alloc_size
|
||||
);
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::CudaUVMSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Reallocate tracked memory in the space */
|
||||
static
|
||||
void * reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Deallocate tracked memory in the space */
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
|
||||
template< typename AliasType >
|
||||
inline
|
||||
::cudaTextureObject_t attach_texture_object()
|
||||
@ -731,8 +781,6 @@ public:
|
||||
return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
|
||||
}
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false );
|
||||
};
|
||||
|
||||
@ -772,6 +820,21 @@ public:
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
);
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::CudaHostPinnedSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Reallocate tracked memory in the space */
|
||||
static
|
||||
void * reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Deallocate tracked memory in the space */
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
|
||||
@ -78,8 +78,9 @@ template< class Arg0 = void , class Arg1 = void , class Arg2 = void
|
||||
, class ExecSpace =
|
||||
// The first argument is the execution space,
|
||||
// otherwise use the default execution space.
|
||||
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
|
||||
, Kokkos::DefaultExecutionSpace >::type
|
||||
typename std::conditional
|
||||
< Impl::is_execution_space< Arg0 >::value , Arg0
|
||||
, Kokkos::DefaultExecutionSpace >::type
|
||||
>
|
||||
class RangePolicy {
|
||||
private:
|
||||
@ -117,8 +118,8 @@ private:
|
||||
) >::value };
|
||||
|
||||
// The work argument tag is the first or second argument
|
||||
typedef typename Impl::if_c< Arg0_WorkTag , Arg0 ,
|
||||
typename Impl::if_c< Arg1_WorkTag , Arg1 , void
|
||||
typedef typename std::conditional< Arg0_WorkTag , Arg0 ,
|
||||
typename std::conditional< Arg1_WorkTag , Arg1 , void
|
||||
>::type >::type
|
||||
WorkTag ;
|
||||
|
||||
@ -128,17 +129,18 @@ private:
|
||||
unsigned(DefaultIntValue) ))) };
|
||||
|
||||
// Only accept the integral type if the blocking is a power of two
|
||||
typedef typename Impl::enable_if< Impl::is_power_of_two< Granularity >::value ,
|
||||
typename Impl::if_c< Arg0_IntType , Arg0 ,
|
||||
typename Impl::if_c< Arg1_IntType , Arg1 ,
|
||||
typename Impl::if_c< Arg2_IntType , Arg2 ,
|
||||
typename Impl::if_c< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
|
||||
typename Impl::if_c< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
|
||||
typename Impl::if_c< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
|
||||
DefaultIntType
|
||||
>::type >::type >::type
|
||||
>::type >::type >::type
|
||||
>::type
|
||||
static_assert( Impl::is_integral_power_of_two( Granularity )
|
||||
, "RangePolicy blocking granularity must be power of two" );
|
||||
|
||||
typedef typename std::conditional< Arg0_IntType , Arg0 ,
|
||||
typename std::conditional< Arg1_IntType , Arg1 ,
|
||||
typename std::conditional< Arg2_IntType , Arg2 ,
|
||||
typename std::conditional< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
|
||||
typename std::conditional< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
|
||||
typename std::conditional< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
|
||||
DefaultIntType
|
||||
>::type >::type >::type
|
||||
>::type >::type >::type
|
||||
IntType ;
|
||||
|
||||
enum { GranularityMask = IntType(Granularity) - 1 };
|
||||
@ -187,8 +189,8 @@ public:
|
||||
* Typically used to partition a range over a group of threads.
|
||||
*/
|
||||
struct WorkRange {
|
||||
typedef RangePolicy::work_tag work_tag ;
|
||||
typedef RangePolicy::member_type member_type ;
|
||||
typedef typename RangePolicy::work_tag work_tag ;
|
||||
typedef typename RangePolicy::member_type member_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
|
||||
KOKKOS_INLINE_FUNCTION member_type end() const { return m_end ; }
|
||||
@ -233,6 +235,38 @@ public:
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
/** \brief  Scratch memory request made of a per-team and a per-thread amount.
 *
 *  An instance of this class can be given as the last argument to a
 *  TeamPolicy constructor.  It sets the amount of user requested shared
 *  memory for the team.
 *
 *  The per-thread amount defaults to zero, so a request built from a
 *  single value asks for per-team memory only.
 */
template< class MemorySpace >
class TeamScratchRequest {
private:

  size_t m_team_amount ;    ///< amount requested once per team
  size_t m_thread_amount ;  ///< amount requested for each thread of the team

public:

  /** \brief  Store the two requested amounts unchanged. */
  TeamScratchRequest( size_t per_team_ , size_t per_thread_ = 0 )
    : m_team_amount( per_team_ )
    , m_thread_amount( per_thread_ )
    {}

  /** \brief  Amount requested once per team. */
  size_t per_team() const { return m_team_amount ; }

  /** \brief  Amount requested for each thread of the team. */
  size_t per_thread() const { return m_thread_amount ; }

  /** \brief  Per-team amount plus the per-thread amount times \c team_size. */
  size_t total( const size_t team_size ) const
    {
      const size_t all_threads = m_thread_amount * team_size ;
      return m_team_amount + all_threads ;
    }
};
|
||||
|
||||
}
|
||||
|
||||
/** \brief Execution policy for parallel work over a league of teams of threads.
|
||||
*
|
||||
* The work functor is called for each thread of each team such that
|
||||
@ -258,8 +292,9 @@ template< class Arg0 = void
|
||||
, class ExecSpace =
|
||||
// If the first argument is not an execution space
|
||||
// then use the default execution space.
|
||||
typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
|
||||
, Kokkos::DefaultExecutionSpace >::type
|
||||
typename std::conditional
|
||||
< Impl::is_execution_space< Arg0 >::value , Arg0
|
||||
, Kokkos::DefaultExecutionSpace >::type
|
||||
>
|
||||
class TeamPolicy {
|
||||
private:
|
||||
@ -268,7 +303,7 @@ private:
|
||||
enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
|
||||
enum { ArgOption_OK = Impl::StaticAssert< ( Arg0_ExecSpace || Arg1_Void ) >::value };
|
||||
|
||||
typedef typename Impl::if_c< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
|
||||
typedef typename std::conditional< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
|
||||
|
||||
public:
|
||||
|
||||
@ -300,10 +335,20 @@ public:
|
||||
static int team_size_recommended( const FunctorType & , const int&);
|
||||
//----------------------------------------
|
||||
/** \brief Construct policy with the given instance of the execution space */
|
||||
TeamPolicy( const execution_space & , int league_size_request , int team_size_request );
|
||||
TeamPolicy( const execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 );
|
||||
|
||||
TeamPolicy( const execution_space & , int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
|
||||
|
||||
/** \brief Construct policy with the default instance of the execution space */
|
||||
TeamPolicy( int league_size_request , int team_size_request );
|
||||
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 );
|
||||
|
||||
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
|
||||
|
||||
/** \brief The actual league size (number of teams) of the policy.
|
||||
*
|
||||
|
||||
327
lib/kokkos/core/src/Kokkos_HBWSpace.hpp
Normal file
327
lib/kokkos/core/src/Kokkos_HBWSpace.hpp
Normal file
@ -0,0 +1,327 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_HBWSPACE_HPP
|
||||
#define KOKKOS_HBWSPACE_HPP
|
||||
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <impl/Kokkos_HBWAllocators.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
/// \brief Initialize lock array for arbitrary size atomics.
|
||||
///
|
||||
/// Arbitrary atomics are implemented using a hash table of locks
|
||||
/// where the hash value is derived from the address of the
|
||||
/// object for which an atomic operation is performed.
|
||||
/// This function initializes the locks to zero (unset).
|
||||
void init_lock_array_hbw_space();
|
||||
|
||||
/// \brief Acquire a lock for the address
|
||||
///
|
||||
/// This function tries to acquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully acquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
bool lock_address_hbw_space(void* ptr);
|
||||
|
||||
/// \brief Release lock for the address
|
||||
///
|
||||
/// This function releases the lock for the hash value derived
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully acquiring a lock with
|
||||
/// lock_address_hbw_space.
|
||||
void unlock_address_hbw_space(void* ptr);
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
/// \class HBWSpace
|
||||
/// \brief Memory management for host memory.
|
||||
///
|
||||
/// HBWSpace is a memory space that governs host memory. "Host"
|
||||
/// memory means the usual CPU-accessible memory.
|
||||
class HBWSpace {
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos memory space
|
||||
typedef HBWSpace memory_space ;
|
||||
typedef size_t size_type ;
|
||||
|
||||
/// \typedef execution_space
|
||||
/// \brief Default execution space for this memory space.
|
||||
///
|
||||
/// Every memory space has a default execution space. This is
|
||||
/// useful for things like initializing a View (which happens in
|
||||
/// parallel using the View's default execution space).
|
||||
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space ;
|
||||
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
typedef Kokkos::Threads execution_space ;
|
||||
#elif defined( KOKKOS_HAVE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space ;
|
||||
#elif defined( KOKKOS_HAVE_PTHREAD )
|
||||
typedef Kokkos::Threads execution_space ;
|
||||
#elif defined( KOKKOS_HAVE_SERIAL )
|
||||
typedef Kokkos::Serial execution_space ;
|
||||
#else
|
||||
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
|
||||
#endif
|
||||
|
||||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
typedef Impl::HBWMallocAllocator allocator ;
|
||||
|
||||
/** \brief Allocate a contiguous block of memory.
|
||||
*
|
||||
* The input label is associated with the block of memory.
|
||||
* The block of memory is tracked via reference counting where
|
||||
* allocation gives it a reference count of one.
|
||||
*/
|
||||
static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HBWSpace */
|
||||
static int in_parallel();
|
||||
|
||||
static void register_in_parallel( int (*)() );
|
||||
|
||||
/*--------------------------------*/
|
||||
|
||||
/**\brief Default memory space instance */
|
||||
HBWSpace();
|
||||
HBWSpace( const HBWSpace & rhs ) = default ;
|
||||
HBWSpace & operator = ( const HBWSpace & ) = default ;
|
||||
~HBWSpace() = default ;
|
||||
|
||||
/**\brief Non-default memory space instance to choose allocation mechansim, if available */
|
||||
|
||||
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
|
||||
|
||||
explicit
|
||||
HBWSpace( const AllocationMechanism & );
|
||||
|
||||
/**\brief Allocate untracked memory in the space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Deallocate untracked memory in the space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
private:
|
||||
|
||||
AllocationMechanism m_alloc_mech ;
|
||||
|
||||
friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
|
||||
};
|
||||
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
|
||||
: public SharedAllocationRecord< void , void >
|
||||
{
|
||||
private:
|
||||
|
||||
friend Kokkos::Experimental::HBWSpace ;
|
||||
|
||||
typedef SharedAllocationRecord< void , void > RecordBase ;
|
||||
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
|
||||
|
||||
static void deallocate( RecordBase * );
|
||||
|
||||
/**\brief Root record for tracked allocations from this HBWSpace instance */
|
||||
static RecordBase s_root_record ;
|
||||
|
||||
const Kokkos::Experimental::HBWSpace m_space ;
|
||||
|
||||
protected:
|
||||
|
||||
~SharedAllocationRecord();
|
||||
SharedAllocationRecord() = default ;
|
||||
|
||||
SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
, const RecordBase::function_type arg_dealloc = & deallocate
|
||||
);
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
std::string get_label() const
|
||||
{
|
||||
return std::string( RecordBase::head()->m_label );
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
SharedAllocationRecord * allocate( const Kokkos::Experimental::HBWSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size
|
||||
)
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
|
||||
#else
|
||||
return (SharedAllocationRecord *) 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::Experimental::HBWSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Reallocate tracked memory in the space */
|
||||
static
|
||||
void * reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Deallocate tracked memory in the space */
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
memcpy( dst , src , n );
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
memcpy( dst , src , n );
|
||||
}
|
||||
};
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
memcpy( dst , src , n );
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
memcpy( dst , src , n );
|
||||
}
|
||||
};
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
memcpy( dst , src , n );
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
exec.fence();
|
||||
memcpy( dst , src , n );
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
|
||||
{
|
||||
enum { value = true };
|
||||
inline static void verify( void ) { }
|
||||
inline static void verify( const void * ) { }
|
||||
};
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
|
||||
{
|
||||
enum { value = true };
|
||||
inline static void verify( void ) { }
|
||||
inline static void verify( const void * ) { }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_HBWSPACE_HPP */
|
||||
|
||||
@ -128,6 +128,8 @@ public:
|
||||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
|
||||
typedef Impl::PageAlignedAllocator allocator ;
|
||||
@ -143,6 +145,8 @@ public:
|
||||
*/
|
||||
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HostSpace */
|
||||
static int in_parallel();
|
||||
@ -164,10 +168,10 @@ public:
|
||||
explicit
|
||||
HostSpace( const AllocationMechanism & );
|
||||
|
||||
/**\brief Allocate memory in the host space */
|
||||
/**\brief Allocate untracked memory in the space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
|
||||
/**\brief Deallocate memory in the host space */
|
||||
/**\brief Deallocate untracked memory in the space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
|
||||
@ -239,6 +243,21 @@ public:
|
||||
#endif
|
||||
}
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::HostSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Reallocate tracked memory in the space */
|
||||
static
|
||||
void * reallocate_tracked( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size );
|
||||
|
||||
/**\brief Deallocate tracked memory in the space */
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
|
||||
@ -157,10 +157,15 @@ struct LayoutStride {
|
||||
/// both tile dimensions are powers of two, Kokkos can optimize
|
||||
/// further.
|
||||
template < unsigned ArgN0 , unsigned ArgN1 ,
|
||||
bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value &&
|
||||
Impl::is_power_of_two<ArgN1>::value )
|
||||
bool IsPowerOfTwo = ( Impl::is_integral_power_of_two(ArgN0) &&
|
||||
Impl::is_integral_power_of_two(ArgN1) )
|
||||
>
|
||||
struct LayoutTileLeft {
|
||||
|
||||
static_assert( Impl::is_integral_power_of_two(ArgN0) &&
|
||||
Impl::is_integral_power_of_two(ArgN1)
|
||||
, "LayoutTileLeft must be given power-of-two tile dimensions" );
|
||||
|
||||
//! Tag this class as a kokkos array layout
|
||||
typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
|
||||
|
||||
|
||||
@ -416,5 +416,11 @@
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
|
||||
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
|
||||
#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
|
||||
#define KOKKOS_POSIX_MEMALIGN_AVAILABLE 1
|
||||
#endif
|
||||
#endif
|
||||
#endif /* #ifndef KOKKOS_MACROS_HPP */
|
||||
|
||||
|
||||
@ -101,9 +101,9 @@ namespace Impl {
|
||||
*/
|
||||
enum { MEMORY_ALIGNMENT =
|
||||
#if defined( KOKKOS_MEMORY_ALIGNMENT )
|
||||
( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value )
|
||||
( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) )
|
||||
#else
|
||||
( 1 << Kokkos::Impl::power_of_two< 128 >::value )
|
||||
( 1 << Kokkos::Impl::integral_power_of_two( 128 ) )
|
||||
#endif
|
||||
, MEMORY_ALIGNMENT_THRESHOLD = 4
|
||||
};
|
||||
|
||||
@ -53,6 +53,9 @@
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
#include <Kokkos_HBWSpace.hpp>
|
||||
#endif
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
@ -72,12 +75,16 @@ public:
|
||||
|
||||
//! Tag this class as a kokkos execution space
|
||||
typedef OpenMP execution_space ;
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
typedef Experimental::HBWSpace memory_space ;
|
||||
#else
|
||||
typedef HostSpace memory_space ;
|
||||
#endif
|
||||
//! This execution space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
|
||||
typedef LayoutRight array_layout ;
|
||||
typedef HostSpace::size_type size_type ;
|
||||
typedef memory_space::size_type size_type ;
|
||||
|
||||
typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
|
||||
|
||||
|
||||
@ -207,8 +207,12 @@ void parallel_for( const ExecPolicy & policy
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelFor< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelFor< FunctorType , ExecPolicy > closure( functor , policy );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
Kokkos::Experimental::endParallelFor(kpID);
|
||||
@ -235,7 +239,11 @@ void parallel_for( const size_t work_count
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelFor< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelFor< FunctorType , policy > closure( functor , policy(0,work_count) );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -333,7 +341,11 @@ void parallel_reduce( const ExecPolicy & policy
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelReduce< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -376,7 +388,11 @@ void parallel_reduce( const size_t work_count
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -394,7 +410,7 @@ void parallel_reduce( const ExecPolicy & policy
|
||||
, const ViewType & result_view
|
||||
, const std::string& str = ""
|
||||
, typename Impl::enable_if<
|
||||
( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
|
||||
( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
|
||||
#endif
|
||||
@ -408,7 +424,11 @@ void parallel_reduce( const ExecPolicy & policy
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -465,7 +485,11 @@ void parallel_reduce( const ExecPolicy & policy
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -482,7 +506,7 @@ void parallel_reduce( const size_t work_count
|
||||
, const FunctorType & functor
|
||||
, const ViewType & result_view
|
||||
, const std::string& str = ""
|
||||
, typename Impl::enable_if<( Impl::is_view<ViewType>::value
|
||||
, typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
&& ! Impl::is_same<
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
|
||||
@ -503,7 +527,11 @@ void parallel_reduce( const size_t work_count
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , ExecPolicy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -564,7 +592,11 @@ void parallel_reduce( const size_t work_count
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -813,7 +845,11 @@ void parallel_scan( const ExecutionPolicy & policy
|
||||
}
|
||||
#endif
|
||||
|
||||
Impl::ParallelScan< FunctorType , ExecutionPolicy > scan( Impl::CopyWithoutTracking::apply(functor) , policy );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelScan< FunctorType , ExecutionPolicy > closure( functor , policy );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
@ -842,7 +878,11 @@ void parallel_scan( const size_t work_count
|
||||
}
|
||||
#endif
|
||||
|
||||
(void) Impl::ParallelScan< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
|
||||
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
|
||||
Impl::ParallelScan< FunctorType , policy > closure( functor , policy(0,work_count) );
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
|
||||
closure.execute();
|
||||
|
||||
#ifdef KOKKOSP_ENABLE_PROFILING
|
||||
if(Kokkos::Experimental::profileLibraryLoaded()) {
|
||||
|
||||
@ -151,7 +151,7 @@ public:
|
||||
static void finalize() {}
|
||||
|
||||
//! Print configuration information to the given output stream.
|
||||
static void print_configuration( std::ostream & , const bool detail = false ) {}
|
||||
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
@ -295,6 +295,7 @@ class TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
|
||||
private:
|
||||
|
||||
const int m_league_size ;
|
||||
const int m_scratch_size ;
|
||||
|
||||
public:
|
||||
|
||||
@ -326,15 +327,55 @@ public:
|
||||
|
||||
inline int team_size() const { return 1 ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size() const { return m_scratch_size ; }
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicy( execution_space & , int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
|
||||
TeamPolicy( execution_space &
|
||||
, int league_size_request
|
||||
, int /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_league_size( league_size_request )
|
||||
{ (void) vector_length_request; }
|
||||
, m_scratch_size ( 0 )
|
||||
{}
|
||||
|
||||
TeamPolicy( int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
|
||||
TeamPolicy( execution_space &
|
||||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_league_size( league_size_request )
|
||||
{ (void) vector_length_request; }
|
||||
, m_scratch_size ( 0 )
|
||||
{}
|
||||
|
||||
TeamPolicy( int league_size_request
|
||||
, int /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_league_size( league_size_request )
|
||||
, m_scratch_size ( 0 )
|
||||
{}
|
||||
|
||||
TeamPolicy( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_league_size( league_size_request )
|
||||
, m_scratch_size ( 0 )
|
||||
{}
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request
|
||||
, int /* team_size_request */
|
||||
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
|
||||
: m_league_size(league_size_request)
|
||||
, m_scratch_size(scratch_request.total(1))
|
||||
{}
|
||||
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
|
||||
: m_league_size(league_size_request)
|
||||
, m_scratch_size(scratch_request.total(1))
|
||||
{}
|
||||
|
||||
typedef Impl::SerialTeamMember member_type ;
|
||||
};
|
||||
@ -346,53 +387,69 @@ public:
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Parallel patterns for Kokkos::Serial with RangePolicy */
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
|
||||
|
||||
public:
|
||||
// work tag is void
|
||||
template< class PType >
|
||||
inline
|
||||
ParallelFor( typename Impl::enable_if<
|
||||
( Impl::is_same< PType , Policy >::value &&
|
||||
Impl::is_same< typename PType::work_tag , void >::value
|
||||
), const FunctorType & >::type functor
|
||||
, const PType & policy )
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec() const
|
||||
{
|
||||
const typename PType::member_type e = policy.end();
|
||||
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
|
||||
functor( i );
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( i );
|
||||
}
|
||||
}
|
||||
|
||||
// work tag is non-void
|
||||
template< class PType >
|
||||
inline
|
||||
ParallelFor( typename Impl::enable_if<
|
||||
( Impl::is_same< PType , Policy >::value &&
|
||||
! Impl::is_same< typename PType::work_tag , void >::value
|
||||
), const FunctorType & >::type functor
|
||||
, const PType & policy )
|
||||
template< class TagType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec() const
|
||||
{
|
||||
const typename PType::member_type e = policy.end();
|
||||
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i );
|
||||
const TagType t{} ;
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( t , i );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{ this-> template exec< typename Policy::work_tag >(); }
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
|
||||
>
|
||||
{
|
||||
public:
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
@ -401,123 +458,136 @@ public:
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
// Work tag is void
|
||||
template< class ViewType , class PType >
|
||||
ParallelReduce( typename Impl::enable_if<
|
||||
( Impl::is_view< ViewType >::value &&
|
||||
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
|
||||
Impl::is_same< PType , Policy >::value &&
|
||||
Impl::is_same< typename PType::work_tag , void >::value
|
||||
), const FunctorType & >::type functor
|
||||
, const PType & policy
|
||||
, const ViewType & result
|
||||
)
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
pointer_type result_ptr = result.ptr_on_device();
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
if ( ! result_ptr ) {
|
||||
result_ptr = (pointer_type)
|
||||
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( i , update );
|
||||
}
|
||||
|
||||
reference_type update = ValueInit::init( functor , result_ptr );
|
||||
|
||||
const typename PType::member_type e = policy.end();
|
||||
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
|
||||
functor( i , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
}
|
||||
|
||||
// Work tag is non-void
|
||||
template< class ViewType , class PType >
|
||||
ParallelReduce( typename Impl::enable_if<
|
||||
( Impl::is_view< ViewType >::value &&
|
||||
Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
|
||||
Impl::is_same< PType , Policy >::value &&
|
||||
! Impl::is_same< typename PType::work_tag , void >::value
|
||||
), const FunctorType & >::type functor
|
||||
, const PType & policy
|
||||
, const ViewType & result
|
||||
)
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
pointer_type result_ptr = result.ptr_on_device();
|
||||
const TagType t{} ;
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
if ( ! result_ptr ) {
|
||||
result_ptr = (pointer_type)
|
||||
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( t , i , update );
|
||||
}
|
||||
|
||||
typename ValueTraits::reference_type update = ValueInit::init( functor , result_ptr );
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
}
|
||||
|
||||
const typename PType::member_type e = policy.end();
|
||||
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i , update );
|
||||
}
|
||||
public:
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
|
||||
( ValueTraits::value_size( m_functor ) , 0 );
|
||||
|
||||
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< ViewType >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
|
||||
|
||||
static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
|
||||
}
|
||||
};
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
|
||||
class ParallelScan< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
|
||||
|
||||
public:
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
// work tag is void
|
||||
template< class PType >
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
ParallelScan( typename Impl::enable_if<
|
||||
( Impl::is_same< PType , Policy >::value &&
|
||||
Impl::is_same< typename PType::work_tag , void >::value
|
||||
), const FunctorType & >::type functor
|
||||
, const PType & policy )
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
pointer_type result_ptr = (pointer_type)
|
||||
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
reference_type update = ValueInit::init( functor , result_ptr );
|
||||
|
||||
const typename PType::member_type e = policy.end();
|
||||
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
|
||||
functor( i , update , true );
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( i , update , true );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
|
||||
}
|
||||
|
||||
// work tag is non-void
|
||||
template< class PType >
|
||||
template< class TagType >
|
||||
inline
|
||||
ParallelScan( typename Impl::enable_if<
|
||||
( Impl::is_same< PType , Policy >::value &&
|
||||
! Impl::is_same< typename PType::work_tag , void >::value
|
||||
), const FunctorType & >::type functor
|
||||
, const PType & policy )
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
pointer_type result_ptr = (pointer_type)
|
||||
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
|
||||
const TagType t{} ;
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
reference_type update = ValueInit::init( functor , result_ptr );
|
||||
|
||||
const typename PType::member_type e = policy.end();
|
||||
for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i , update , true );
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( t , i , update , true );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
pointer_type ptr = (pointer_type)
|
||||
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
|
||||
this-> template exec< WorkTag >( ptr );
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -525,112 +595,157 @@ public:
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Parallel patterns for Kokkos::Serial with TeamPolicy */
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 >
|
||||
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const int m_league ;
|
||||
const int m_shared ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename Policy::member_type & member )
|
||||
{ functor( member ); }
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename Policy::member_type & member )
|
||||
{ functor( TagType() , member ); }
|
||||
|
||||
public:
|
||||
|
||||
ParallelFor( const FunctorType & functor
|
||||
, const Policy & policy )
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec() const
|
||||
{
|
||||
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
|
||||
|
||||
Kokkos::Serial::scratch_memory_resize( 0 , shared_size );
|
||||
|
||||
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
|
||||
ParallelFor::template driver< typename Policy::work_tag >
|
||||
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) );
|
||||
// functor( typename Policy::member_type(ileague,policy.league_size(),shared_size) );
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( Member(ileague,m_league,m_shared) );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec() const
|
||||
{
|
||||
const TagType t{} ;
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( t , Member(ileague,m_league,m_shared) );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
|
||||
this-> template exec< typename Policy::work_tag >();
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
{ }
|
||||
};
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 >
|
||||
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
|
||||
|
||||
public:
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
private:
|
||||
const FunctorType m_functor ;
|
||||
const int m_league ;
|
||||
const int m_shared ;
|
||||
pointer_type m_result_ptr ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename Policy::member_type & member
|
||||
, reference_type update )
|
||||
{ functor( member , update ); }
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( Member(ileague,m_league,m_shared) , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename Policy::member_type & member
|
||||
, reference_type update )
|
||||
{ functor( TagType() , member , update ); }
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
{
|
||||
const TagType t{} ;
|
||||
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( t , Member(ileague,m_league,m_shared) , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , TagType >::
|
||||
final( m_functor , ptr );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & functor
|
||||
, const Policy & policy
|
||||
, const ViewType & result
|
||||
)
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
const int reduce_size = ValueTraits::value_size( functor );
|
||||
const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
|
||||
void * const scratch_reduce = Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size );
|
||||
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
|
||||
( ValueTraits::value_size( m_functor ) , m_shared );
|
||||
|
||||
const pointer_type result_ptr =
|
||||
result.ptr_on_device() ? result.ptr_on_device()
|
||||
: (pointer_type) scratch_reduce ;
|
||||
|
||||
reference_type update = ValueInit::init( functor , result_ptr );
|
||||
|
||||
for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
|
||||
ParallelReduce::template driver< typename Policy::work_tag >
|
||||
( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
|
||||
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result
|
||||
)
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< ViewType >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
|
||||
|
||||
static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
@ -739,8 +854,6 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Ser
|
||||
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX11
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
|
||||
@ -764,8 +877,6 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Ser
|
||||
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
|
||||
}
|
||||
|
||||
#endif // KOKKOS_HAVE_CXX11
|
||||
|
||||
} //namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
@ -47,11 +47,12 @@
|
||||
#include <type_traits>
|
||||
#include <string>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
|
||||
#include <impl/Kokkos_StaticAssert.hpp>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Shape.hpp>
|
||||
@ -444,14 +445,14 @@ template< class DataType ,
|
||||
typename ViewTraits<DataType,Arg1Type,Arg2Type,Arg3Type>::specialize >
|
||||
class View ;
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class C >
|
||||
struct is_view : public bool_< false > {};
|
||||
struct is_view : public Impl::bool_< false > {};
|
||||
|
||||
template< class D , class A1 , class A2 , class A3 , class S >
|
||||
struct is_view< View< D , A1 , A2 , A3 , S > > : public bool_< true > {};
|
||||
struct is_view< View< D , A1 , A2 , A3 , S > > : public Impl::bool_< true > {};
|
||||
|
||||
namespace Impl {
|
||||
using Kokkos::is_view ;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -952,33 +953,37 @@ public:
|
||||
Impl::ViewError::scalar_operator_called_from_non_scalar_view >
|
||||
if_scalar_operator ;
|
||||
|
||||
typedef Impl::if_c< traits::rank == 0 ,
|
||||
reference_type ,
|
||||
Impl::ViewError::scalar_operator_called_from_non_scalar_view >
|
||||
if_scalar_operator_return ;
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const View & operator = ( const typename if_scalar_operator::type & rhs ) const
|
||||
{
|
||||
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
|
||||
*m_ptr_on_device = if_scalar_operator::select( rhs );
|
||||
m_ptr_on_device[ 0 ] = if_scalar_operator::select( rhs );
|
||||
return *this ;
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
operator typename if_scalar_operator::type & () const
|
||||
operator typename if_scalar_operator_return::type () const
|
||||
{
|
||||
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
|
||||
return if_scalar_operator::select( *m_ptr_on_device );
|
||||
return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename if_scalar_operator::type & operator()() const
|
||||
typename if_scalar_operator_return::type operator()() const
|
||||
{
|
||||
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
|
||||
return if_scalar_operator::select( *m_ptr_on_device );
|
||||
return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
|
||||
}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename if_scalar_operator::type & operator*() const
|
||||
typename if_scalar_operator_return::type operator*() const
|
||||
{
|
||||
KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
|
||||
return if_scalar_operator::select( *m_ptr_on_device );
|
||||
return if_scalar_operator_return::select( m_ptr_on_device[ 0 ] );
|
||||
}
|
||||
|
||||
//------------------------------------
|
||||
@ -1849,6 +1854,8 @@ void resize( View<T,L,D,M,S> & v ,
|
||||
|
||||
Impl::ViewRemap< view_type , view_type >( v_resized , v );
|
||||
|
||||
view_type::execution_space::fence();
|
||||
|
||||
v = v_resized ;
|
||||
}
|
||||
|
||||
@ -2092,27 +2099,10 @@ struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} };
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <KokkosExp_View.hpp>
|
||||
|
||||
#else
|
||||
|
||||
// Must define before including <impl/Kokkos_ViewOffset.hpp>
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
struct ALL_t ;
|
||||
}
|
||||
}
|
||||
using ALL = Experimental::Impl::ALL_t ;
|
||||
}
|
||||
|
||||
#include <impl/Kokkos_ViewOffset.hpp>
|
||||
#include <impl/Kokkos_ViewSupport.hpp>
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
#include <KokkosExp_View.hpp>
|
||||
|
||||
#endif /* #if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -109,7 +109,7 @@ unsigned thread_mapping( const char * const label ,
|
||||
/** \brief Query core-coordinate of the current thread
|
||||
* with respect to the core_topology.
|
||||
*
|
||||
* As long as the thread is running within the
|
||||
* As long as the thread is running within the
|
||||
* process binding the following condition holds.
|
||||
*
|
||||
* core_coordinate.first < core_topology.first
|
||||
@ -120,6 +120,10 @@ std::pair<unsigned,unsigned> get_this_thread_coordinate();
|
||||
/** \brief Bind the current thread to a core. */
|
||||
bool bind_this_thread( const std::pair<unsigned,unsigned> );
|
||||
|
||||
|
||||
/** \brief Can hwloc bind threads? */
|
||||
bool can_bind_threads();
|
||||
|
||||
/** \brief Bind the current thread to one of the cores in the list.
|
||||
* Set that entry to (~0,~0) and return the index.
|
||||
* If binding fails return ~0.
|
||||
|
||||
@ -4,14 +4,14 @@ PREFIX ?= /usr/local/lib/kokkos
|
||||
|
||||
default: messages build-lib
|
||||
echo "End Build"
|
||||
|
||||
|
||||
|
||||
include $(KOKKOS_PATH)/Makefile.kokkos
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
CXX = nvcc_wrapper
|
||||
CXX = $(NVCC_WRAPPER)
|
||||
CXXFLAGS ?= -O3
|
||||
LINK = nvcc_wrapper
|
||||
LINK = $(NVCC_WRAPPER)
|
||||
LINKFLAGS ?=
|
||||
else
|
||||
CXX ?= g++
|
||||
@ -62,8 +62,10 @@ build-makefile-kokkos:
|
||||
echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos
|
||||
echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
|
||||
echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos
|
||||
echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
|
||||
echo "CXX ?= $(CXX)" >> Makefile.kokkos
|
||||
echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
|
||||
echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
|
||||
@ -90,6 +92,7 @@ build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
|
||||
|
||||
mkdir:
|
||||
mkdir -p $(PREFIX)
|
||||
mkdir -p $(PREFIX)/bin
|
||||
mkdir -p $(PREFIX)/include
|
||||
mkdir -p $(PREFIX)/lib
|
||||
mkdir -p $(PREFIX)/include/impl
|
||||
@ -97,7 +100,7 @@ mkdir:
|
||||
copy-cuda: mkdir
|
||||
mkdir -p $(PREFIX)/include/Cuda
|
||||
cp $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda
|
||||
|
||||
|
||||
copy-threads: mkdir
|
||||
mkdir -p $(PREFIX)/include/Threads
|
||||
cp $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
|
||||
@ -111,13 +114,14 @@ copy-openmp: mkdir
|
||||
cp $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
|
||||
|
||||
install: mkdir $(CONDITIONAL_COPIES) build-lib
|
||||
cp $(NVCC_WRAPPER) $(PREFIX)/bin
|
||||
cp $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
|
||||
cp $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
|
||||
cp Makefile.kokkos $(PREFIX)
|
||||
cp libkokkos.a $(PREFIX)/lib
|
||||
cp KokkosCore_config.h $(PREFIX)/include
|
||||
|
||||
|
||||
|
||||
|
||||
clean: kokkos-clean
|
||||
rm Makefile.kokkos
|
||||
|
||||
@ -57,41 +57,57 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const PType & range )
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const typename PType::member_type work_end = range.end();
|
||||
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const typename PType::member_type work_end = range.end();
|
||||
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
|
||||
functor( typename PType::work_tag() , iwork );
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & functor
|
||||
, const Policy & policy )
|
||||
void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
@ -99,10 +115,20 @@ public:
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
driver( functor , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) );
|
||||
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -115,90 +141,119 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range )
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update )
|
||||
{
|
||||
const typename PType::member_type work_end = range.end();
|
||||
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update )
|
||||
{
|
||||
const typename PType::member_type work_end = range.end();
|
||||
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
|
||||
functor( typename PType::work_tag() , iwork , update );
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( m_functor );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
ParallelReduce( typename Impl::enable_if<
|
||||
( Impl::is_view< ViewType >::value &&
|
||||
Impl::is_same< typename ViewType::memory_space , HostSpace >::value
|
||||
), const FunctorType & >::type functor
|
||||
, const Policy & policy
|
||||
, const ViewType & result_view )
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result_view )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
static_assert( Kokkos::is_view< ViewType >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
|
||||
|
||||
driver( functor
|
||||
, ValueInit::init( functor , exec.scratch_reduce() )
|
||||
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
|
||||
);
|
||||
static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
{
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
|
||||
|
||||
if ( result_view.ptr_on_device() ) {
|
||||
const int n = ValueTraits::value_count( functor );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { result_view.ptr_on_device()[j] = ptr[j] ; }
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -211,106 +266,129 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
|
||||
class ParallelScan< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType , WorkTag > ValueOps ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range
|
||||
, const bool final )
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const typename PType::member_type work_end = range.end();
|
||||
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( iwork , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range
|
||||
, const bool final )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const typename PType::member_type work_end = range.end();
|
||||
for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
|
||||
functor( typename PType::work_tag() , iwork , update , final );
|
||||
const TagType t{} ;
|
||||
#ifdef KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION
|
||||
#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
#endif
|
||||
for ( Member iwork = ibeg ; iwork < iend ; ++iwork ) {
|
||||
functor( t , iwork , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
|
||||
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr =
|
||||
pointer_type( exec.scratch_reduce() ) +
|
||||
ValueTraits::value_count( m_functor );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( m_functor , ptr ) , false );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
{
|
||||
const unsigned thread_count = OpenMPexec::pool_size();
|
||||
const unsigned value_count = ValueTraits::value_count( m_functor );
|
||||
|
||||
pointer_type ptr_prev = 0 ;
|
||||
|
||||
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
|
||||
|
||||
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
|
||||
|
||||
if ( ptr_prev ) {
|
||||
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
|
||||
ValueJoin::join( m_functor , ptr + value_count , ptr );
|
||||
}
|
||||
else {
|
||||
ValueInit::init( m_functor , ptr );
|
||||
}
|
||||
|
||||
ptr_prev = ptr ;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueOps::reference( ptr ) , true );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline
|
||||
ParallelScan( const FunctorType & functor
|
||||
, const Policy & policy )
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
|
||||
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( functor ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
|
||||
driver( functor
|
||||
, ValueInit::init( functor , pointer_type( exec.scratch_reduce() ) + ValueTraits::value_count( functor ) )
|
||||
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
|
||||
, false );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
{
|
||||
const unsigned thread_count = OpenMPexec::pool_size();
|
||||
const unsigned value_count = ValueTraits::value_count( functor );
|
||||
|
||||
pointer_type ptr_prev = 0 ;
|
||||
|
||||
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
|
||||
|
||||
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
|
||||
|
||||
if ( ptr_prev ) {
|
||||
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
|
||||
ValueJoin::join( functor , ptr + value_count , ptr );
|
||||
}
|
||||
else {
|
||||
ValueInit::init( functor , ptr );
|
||||
}
|
||||
|
||||
ptr_prev = ptr ;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
|
||||
driver( functor
|
||||
, ValueOps::reference( pointer_type( exec.scratch_reduce() ) )
|
||||
, typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
|
||||
, true );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
}
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
|
||||
//----------------------------------------
|
||||
};
|
||||
@ -325,62 +403,84 @@ namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 >
|
||||
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename Policy::member_type & member )
|
||||
{ functor( member ); }
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( member );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename Policy::member_type & member )
|
||||
{ functor( TagType() , member ); }
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
const TagType t{} ;
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( t , member );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & functor ,
|
||||
const Policy & policy )
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
|
||||
OpenMPexec::resize_scratch( 0 , team_reduce_size + team_shmem_size );
|
||||
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
typename Policy::member_type member( * OpenMPexec::get_thread_omp() , policy , team_shmem_size );
|
||||
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
ParallelFor::template driver< typename Policy::work_tag >( functor , member );
|
||||
{
|
||||
ParallelFor::template exec_team< WorkTag >
|
||||
( m_functor
|
||||
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
|
||||
}
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
}
|
||||
|
||||
void wait() {}
|
||||
// Constructor: stores the functor and policy and precomputes the per-team
// shared-memory request. The body is empty; nothing is executed here.
inline
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
// m_shmem_size = scratch size requested through the policy
|
||||
//              + FunctorTeamShmemSize value for this functor at the policy's team size.
|
||||
// NOTE(review): m_shmem_size is declared `const int` in this class; if
|
||||
// scratch_size() returns size_t the sum is narrowed here -- confirm the
|
||||
// request always fits in an int.
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
};
|
||||
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 >
|
||||
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag > ValueJoin ;
|
||||
@ -388,102 +488,85 @@ private:
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename PType::member_type & member
|
||||
, reference_type update )
|
||||
{ functor( member , update ); }
|
||||
// Untagged overload: enabled only when TagType is void.
// Calls functor( member , update ) once per iteration for as long as
// member.valid() holds, advancing with member.next(). `member` is taken by
// value, so the caller's Member object is not advanced.
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
{
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( member , update );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
|
||||
const FunctorType & >::type functor
|
||||
, const typename PType::member_type & member
|
||||
, reference_type update )
|
||||
{ functor( typename PType::work_tag() , member , update ); }
|
||||
// Tagged overload: enabled only when TagType is not void.
// Same iteration as the untagged overload, but the functor is called as
// functor( t , member , update ) with a tag instance as the first argument.
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
{
|
||||
// One tag object is created before the loop and reused for every call.
const TagType t{} ;
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( t , member , update );
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
inline
|
||||
ParallelReduce( const FunctorType & functor ,
|
||||
const Policy & policy )
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
|
||||
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
|
||||
|
||||
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
|
||||
ParallelReduce::template driver< Policy >( functor , member , update );
|
||||
ParallelReduce::template exec_team< WorkTag >
|
||||
( m_functor
|
||||
, Member( exec , m_policy , m_shmem_size )
|
||||
, ValueInit::init( m_functor , exec.scratch_reduce() ) );
|
||||
}
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
{
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag , reference_type > Join ;
|
||||
{
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
int max_active_threads = OpenMPexec::pool_size();
|
||||
if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
|
||||
max_active_threads = m_policy.league_size()* m_policy.team_size();
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
Join::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
for ( int i = 1 ; i < max_active_threads ; ++i ) {
|
||||
ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( m_functor );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
|
||||
}
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
ParallelReduce( const FunctorType & functor ,
|
||||
const Policy & policy ,
|
||||
const ViewType & result )
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
const size_t team_shmem_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
|
||||
reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
|
||||
|
||||
for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
|
||||
ParallelReduce::template driver< Policy >( functor , member , update );
|
||||
}
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
{
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
|
||||
|
||||
const int n = ValueTraits::value_count( functor );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; }
|
||||
}
|
||||
}
|
||||
|
||||
void wait() {}
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const ViewType & arg_result )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -84,8 +84,16 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
// Out-of-class definitions of OpenMPexec static data members.
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
// The type of m_pool depends on KOKKOS_USING_EXPERIMENTAL_VIEW:
//   not defined -> a single Pool object
//   defined     -> an array of MAX_THREAD_COUNT OpenMPexec pointers, all null
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
OpenMPexec::Pool OpenMPexec::m_pool;
|
||||
|
||||
#else
|
||||
|
||||
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
#endif
|
||||
|
||||
void OpenMPexec::verify_is_process( const char * const label )
|
||||
{
|
||||
if ( omp_in_parallel() ) {
|
||||
@ -102,6 +110,13 @@ void OpenMPexec::verify_initialized( const char * const label )
|
||||
msg.append( " ERROR: not initialized" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
if ( omp_get_max_threads() != Kokkos::OpenMP::thread_pool_size(0) ) {
|
||||
std::string msg( label );
|
||||
msg.append( " ERROR: Initialized but threads modified inappropriately" );
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void OpenMPexec::clear_scratch()
|
||||
@ -109,7 +124,16 @@ void OpenMPexec::clear_scratch()
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
if ( m_pool[ rank_rev ] ) {
|
||||
Record * const r = Record::get_record( m_pool[ rank_rev ] );
|
||||
m_pool[ rank_rev ] = 0 ;
|
||||
Record::decrement( r );
|
||||
}
|
||||
#else
|
||||
m_pool.at(rank_rev).clear();
|
||||
#endif
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
@ -147,7 +171,27 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
|
||||
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
|
||||
const int rank = pool_size - ( rank_rev + 1 );
|
||||
|
||||
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::HostSpace()
|
||||
, "openmp_scratch"
|
||||
, alloc_size );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
|
||||
|
||||
#else
|
||||
|
||||
#pragma omp critical
|
||||
{
|
||||
m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
@ -248,7 +292,9 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
// Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
|
||||
|
||||
const unsigned omp_rank = omp_get_thread_num();
|
||||
const unsigned thread_r = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ;
|
||||
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
|
||||
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
|
||||
: omp_rank ;
|
||||
|
||||
Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
|
||||
}
|
||||
@ -293,7 +339,7 @@ void OpenMP::finalize()
|
||||
|
||||
omp_set_num_threads(1);
|
||||
|
||||
if ( Impl::s_using_hwloc ) {
|
||||
if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
|
||||
hwloc::unbind_this_thread();
|
||||
}
|
||||
}
|
||||
|
||||
@ -61,6 +61,8 @@ public:
|
||||
|
||||
enum { MAX_THREAD_COUNT = 4096 };
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
struct Pool
|
||||
{
|
||||
Pool() : m_trackers() {}
|
||||
@ -78,11 +80,21 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
private:
|
||||
|
||||
static Pool m_pool; // Indexed by: m_pool_rank_rev
|
||||
|
||||
#else
|
||||
|
||||
private:
|
||||
|
||||
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
|
||||
|
||||
#endif
|
||||
|
||||
static int m_pool_topo[ 4 ];
|
||||
static int m_map_rank[ MAX_THREAD_COUNT ];
|
||||
static Pool m_pool; // Indexed by: m_pool_rank_rev
|
||||
|
||||
friend class Kokkos::OpenMP ;
|
||||
|
||||
@ -193,12 +205,14 @@ private:
|
||||
inline
|
||||
bool team_fan_in() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
|
||||
}
|
||||
|
||||
if ( m_team_rank_rev ) {
|
||||
m_exec.state_set( Rendezvous );
|
||||
memory_fence();
|
||||
m_exec.state_wait( Rendezvous );
|
||||
}
|
||||
|
||||
@ -208,8 +222,10 @@ private:
|
||||
// Sets the state of selected team members to Active.
// Starting from this member's reversed rank, the loop visits
// j = m_team_rank_rev + n for n = 1, 2, 4, ... and stops as soon as
// j reaches m_team_size or bit n is set in m_team_rank_rev.
// A memory_fence() is issued once before the loop and again after every
// state_set() call.
inline
|
||||
void team_fan_out() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
|
||||
memory_fence();
|
||||
}
|
||||
}
|
||||
|
||||
@ -265,6 +281,7 @@ public:
|
||||
{ return ValueType(); }
|
||||
#else
|
||||
{
|
||||
memory_fence();
|
||||
typedef ValueType value_type;
|
||||
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
|
||||
#endif
|
||||
@ -301,6 +318,7 @@ public:
|
||||
for ( int i = 1 ; i < m_team_size ; ++i ) {
|
||||
op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
|
||||
}
|
||||
memory_fence();
|
||||
|
||||
// The base team member may "lap" the other team members,
|
||||
// copy to their local value before proceeding.
|
||||
@ -484,6 +502,8 @@ private:
|
||||
int m_team_alloc ;
|
||||
int m_team_iter ;
|
||||
|
||||
size_t m_scratch_size;
|
||||
|
||||
inline void init( const int league_size_request
|
||||
, const int team_size_request )
|
||||
{
|
||||
@ -511,13 +531,49 @@ public:
|
||||
|
||||
// Read-only accessors for the stored team size, league size and
// scratch-size request (m_team_size, m_league_size, m_scratch_size).
inline int team_size() const { return m_team_size ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size() const { return m_scratch_size ; }
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1)
|
||||
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
|
||||
TeamPolicy( execution_space &
|
||||
, int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_scratch_size ( 0 )
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
|
||||
{ init( league_size_request , team_size_request ); (void) vector_length_request; }
|
||||
TeamPolicy( execution_space &
|
||||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1)
|
||||
: m_scratch_size ( 0 )
|
||||
{ init( league_size_request , execution_space::thread_pool_size(2) ); }
|
||||
|
||||
// League size and explicit team size request, no scratch request:
// m_scratch_size is set to 0 and both requests are forwarded to init().
// The third (vector length) argument is accepted but unused.
TeamPolicy( int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_scratch_size ( 0 )
|
||||
{ init( league_size_request , team_size_request ); }
|
||||
|
||||
// League size with automatic team size, no scratch request:
// the AUTO_t argument only selects this overload; the team size handed to
// init() is execution_space::thread_pool_size(2). m_scratch_size is set to 0.
// The vector length argument is accepted but unused.
TeamPolicy( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_scratch_size ( 0 )
|
||||
{ init( league_size_request , execution_space::thread_pool_size(2) ); }
|
||||
|
||||
// League size, explicit team size and a team scratch request:
// m_scratch_size is taken from scratch_request.total( team_size_request ),
// i.e. it is computed from the *requested* team size before init() runs.
// NOTE(review): if init() can adjust the team size, the stored scratch size
// still reflects the request -- confirm that this is intended.
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request
|
||||
, int team_size_request
|
||||
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
|
||||
: m_scratch_size(scratch_request.total(team_size_request))
|
||||
{ init(league_size_request,team_size_request); }
|
||||
|
||||
|
||||
// League size, automatic team size and a team scratch request:
// execution_space::thread_pool_size(2) is used both as the team size passed
// to init() and as the argument to scratch_request.total() for m_scratch_size.
// The AUTO_t argument only selects this overload.
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
|
||||
: m_scratch_size(scratch_request.total(execution_space::thread_pool_size(2)))
|
||||
{ init(league_size_request,execution_space::thread_pool_size(2)); }
|
||||
|
||||
// Read-only accessors for m_team_alloc and m_team_iter.
inline int team_alloc() const { return m_team_alloc ; }
|
||||
inline int team_iter() const { return m_team_iter ; }
|
||||
|
||||
@ -212,7 +212,7 @@ public:
|
||||
|
||||
// Join from lower ranking to higher ranking worker.
|
||||
// Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
|
||||
for ( int i = m_worker_size - 1 ; --i ; ) {
|
||||
for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
|
||||
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
|
||||
}
|
||||
}
|
||||
|
||||
@ -61,47 +61,50 @@ namespace Impl {
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( i );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( ! Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor , const Member ibeg , const Member iend )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i );
|
||||
const TagType t{} ;
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( t , i );
|
||||
}
|
||||
}
|
||||
|
||||
// Function is called once by every concurrent thread.
|
||||
static void execute( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() ) );
|
||||
const WorkRange range( self.m_policy, exec.worker_rank(), exec.worker_size() );
|
||||
|
||||
ParallelFor::template exec_range< WorkTag > ( self.m_functor , range.begin() , range.end() );
|
||||
|
||||
// All threads wait for completion.
|
||||
exec.exec_all_barrier();
|
||||
@ -109,95 +112,110 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
ParallelFor( const FunctorType & functor
|
||||
, const Policy & policy
|
||||
)
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
|
||||
|
||||
}
|
||||
|
||||
// Constructor: only records the functor and policy in m_functor / m_policy.
// The body is empty, so no work is dispatched from the constructor.
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{ }
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Qthread >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( i , update );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( ! Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i , update );
|
||||
const TagType t{} ;
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( t , i , update );
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
|
||||
driver( self.m_func
|
||||
, ValueInit::init( self.m_func , exec.exec_all_reduce_value() )
|
||||
, typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() )
|
||||
);
|
||||
const WorkRange range( self.m_policy, exec.worker_rank(), exec.worker_size() );
|
||||
|
||||
exec.template exec_all_reduce<FunctorType, typename Policy::work_tag >( self.m_func );
|
||||
ParallelReduce::template exec_range< WorkTag >(
|
||||
self.m_functor, range.begin(), range.end(),
|
||||
ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
|
||||
|
||||
exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & functor
|
||||
, const Policy & policy
|
||||
, const HostViewType & result_view )
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
|
||||
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
|
||||
|
||||
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
|
||||
|
||||
if ( result_view.ptr_on_device() ) {
|
||||
const unsigned n = ValueTraits::value_count( m_func );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
|
||||
if ( m_result_ptr ) {
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
// Constructor: records the functor and policy and caches the raw pointer
// obtained from arg_result_view.ptr_on_device() in m_result_ptr.
// The view itself is not stored. The body is empty; no reduction runs here.
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const HostViewType & arg_result_view )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
{ }
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -208,50 +226,63 @@ class ParallelFor< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
|
||||
private:
|
||||
|
||||
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const Policy m_team ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member ) const
|
||||
{ m_func( member ); }
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
while ( member ) {
|
||||
functor( member );
|
||||
member.team_barrier();
|
||||
member.next_team();
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member ) const
|
||||
{ m_func( TagType() , member ); }
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
const TagType t{} ;
|
||||
while ( member ) {
|
||||
functor( t , member );
|
||||
member.team_barrier();
|
||||
member.next_team();
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
typename Policy::member_type member( exec , self.m_team );
|
||||
|
||||
while ( member ) {
|
||||
self.ParallelFor::template driver< typename Policy::work_tag >( member );
|
||||
member.team_barrier();
|
||||
member.next_team();
|
||||
}
|
||||
ParallelFor::template exec_team< WorkTag >
|
||||
( self.m_functor , Member( exec , self.m_policy ) );
|
||||
|
||||
exec.exec_all_barrier();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
ParallelFor( const FunctorType & functor ,
|
||||
const Policy & policy )
|
||||
: m_func( functor )
|
||||
, m_team( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch
|
||||
( /* reduction memory */ 0
|
||||
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
|
||||
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
|
||||
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{ }
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -263,148 +294,170 @@ private:
|
||||
|
||||
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > Policy ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const Policy m_team ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member
|
||||
, reference_type update ) const
|
||||
{ m_func( member , update ); }
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
{
|
||||
while ( member ) {
|
||||
functor( member , update );
|
||||
member.team_barrier();
|
||||
member.next_team();
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member
|
||||
, reference_type update ) const
|
||||
{ m_func( TagType() , member , update ); }
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
{
|
||||
const TagType t{} ;
|
||||
while ( member ) {
|
||||
functor( t , member , update );
|
||||
member.team_barrier();
|
||||
member.next_team();
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
|
||||
// Initialize thread-local value
|
||||
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
|
||||
ParallelReduce::template exec_team< WorkTag >
|
||||
( self.m_functor
|
||||
, Member( exec , self.m_policy )
|
||||
, ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) );
|
||||
|
||||
typename Policy::member_type member( exec , self.m_team );
|
||||
|
||||
while ( member ) {
|
||||
self.ParallelReduce::template driver< typename Policy::work_tag >( member , update );
|
||||
member.team_barrier();
|
||||
member.next_team();
|
||||
}
|
||||
|
||||
exec.template exec_all_reduce< FunctorType , typename Policy::work_tag >( self.m_func );
|
||||
exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & functor ,
|
||||
const Policy & policy ,
|
||||
const ViewType & result )
|
||||
: m_func( functor )
|
||||
, m_team( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch
|
||||
( /* reduction memory */ ValueTraits::value_size( functor )
|
||||
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
|
||||
( /* reduction memory */ ValueTraits::value_size( m_functor )
|
||||
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
|
||||
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
|
||||
|
||||
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
|
||||
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
|
||||
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data );
|
||||
|
||||
const unsigned n = ValueTraits::value_count( m_func );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
|
||||
if ( m_result_ptr ) {
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const ViewType & arg_result )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
{ }
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
|
||||
class ParallelScan< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > Policy ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , typename Policy::work_tag > ValueInit ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const bool final
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( i , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( ! Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const bool final
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i , update , final );
|
||||
const TagType t{} ;
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( t , i , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelScan & self = * ((const ParallelScan *) arg );
|
||||
|
||||
const typename Policy::WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
|
||||
const WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
|
||||
|
||||
// Initialize thread-local value
|
||||
reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
|
||||
reference_type update = ValueInit::init( self.m_functor , exec.exec_all_reduce_value() );
|
||||
|
||||
driver( self.m_func , update , false , range );
|
||||
ParallelScan::template exec_range< WorkTag >( self.m_functor, range.begin() , range.end() , update , false );
|
||||
|
||||
exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_func );
|
||||
exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_functor );
|
||||
|
||||
driver( self.m_func , update , true , range );
|
||||
ParallelScan::template exec_range< WorkTag >( self.m_functor , range.begin() , range.end() , update , true );
|
||||
|
||||
exec.exec_all_barrier();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
ParallelScan( const FunctorType & functor
|
||||
, const Policy & policy
|
||||
)
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
|
||||
}
|
||||
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this );
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
)
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@ -255,6 +255,56 @@ void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void Task::closeout()
|
||||
{
|
||||
enum { RESPAWN = int( Kokkos::Experimental::TASK_STATE_WAITING ) |
|
||||
int( Kokkos::Experimental::TASK_STATE_EXECUTING ) };
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "worker(%d.%d) task 0x%.12lx %s\n"
|
||||
, qthread_shep()
|
||||
, qthread_worker_local(NULL)
|
||||
, reinterpret_cast<unsigned long>(this)
|
||||
, ( m_state == RESPAWN ? "respawn" : "complete" )
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
// When dependent tasks run there would be a race
|
||||
// condition between destroying this task and
|
||||
// querying the active count pointer from this task.
|
||||
int volatile * const active_count = m_active_count ;
|
||||
|
||||
if ( m_state == RESPAWN ) {
|
||||
// Task requests respawn, set state to waiting and reschedule the task
|
||||
m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
|
||||
schedule();
|
||||
}
|
||||
else {
|
||||
|
||||
// Task did not respawn, is complete
|
||||
m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
|
||||
|
||||
// Release dependences before allowing dependent tasks to run.
|
||||
// Otherwise there is a thread race condition for removing dependences.
|
||||
for ( int i = 0 ; i < m_dep_size ; ++i ) {
|
||||
assign( & m_dep[i] , 0 );
|
||||
}
|
||||
|
||||
// Set qthread FEB to full so that dependent tasks are allowed to execute.
|
||||
// This 'task' may be deleted immediately following this function call.
|
||||
qthread_fill( & m_qfeb );
|
||||
|
||||
// The dependent task could now complete and destroy 'this' task
|
||||
// before the call to 'qthread_fill' returns. Therefore, for
|
||||
// thread safety assume that 'this' task has now been destroyed.
|
||||
}
|
||||
|
||||
// Decrement active task count before returning.
|
||||
Kokkos::atomic_decrement( active_count );
|
||||
}
|
||||
|
||||
aligned_t Task::qthread_func( void * arg )
|
||||
{
|
||||
Task * const task = reinterpret_cast< Task * >(arg);
|
||||
@ -291,62 +341,18 @@ fflush(stdout);
|
||||
#endif
|
||||
|
||||
member.team_barrier();
|
||||
|
||||
close_out = member.team_rank() == 0 ;
|
||||
if ( member.team_rank() == 0 ) task->closeout();
|
||||
member.team_barrier();
|
||||
}
|
||||
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) {
|
||||
// Team hard-wired to one, no cloning
|
||||
Kokkos::Impl::QthreadTeamPolicyMember member ;
|
||||
(*task->m_apply_team)( task , member );
|
||||
close_out = true ;
|
||||
task->closeout();
|
||||
}
|
||||
else {
|
||||
(*task->m_apply_single)( task );
|
||||
|
||||
close_out = true ;
|
||||
}
|
||||
|
||||
if ( close_out ) {
|
||||
|
||||
// When dependent tasks run there would be a race
|
||||
// condition between destroying this task and
|
||||
// querying the active count pointer from this task.
|
||||
int volatile * active_count = task->m_active_count ;
|
||||
|
||||
if ( task->m_state == ( Kokkos::Experimental::TASK_STATE_WAITING | Kokkos::Experimental::TASK_STATE_EXECUTING ) ) {
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "worker(%d.%d) task 0x%.12lx respawn\n"
|
||||
, qthread_shep()
|
||||
, qthread_worker_local(NULL)
|
||||
, reinterpret_cast<unsigned long>(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
// Task respawned, set state to waiting and reschedule the task
|
||||
task->m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
|
||||
task->schedule();
|
||||
}
|
||||
else {
|
||||
|
||||
// Task did not respawn, is complete
|
||||
task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
|
||||
|
||||
// Release dependences before allowing dependent tasks to run.
|
||||
// Otherwise there is a thread race condition for removing dependences.
|
||||
for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
|
||||
assign( & task->m_dep[i] , 0 );
|
||||
}
|
||||
|
||||
// Set qthread FEB to full so that dependent tasks are allowed to execute.
|
||||
// This 'task' may be deleted immediately following this function call.
|
||||
qthread_fill( & task->m_qfeb );
|
||||
}
|
||||
|
||||
// Decrement active task count before returning.
|
||||
Kokkos::atomic_decrement( active_count );
|
||||
task->closeout();
|
||||
}
|
||||
|
||||
#if 0
|
||||
@ -419,8 +425,7 @@ fflush(stdout);
|
||||
, NULL
|
||||
, m_dep_size , qprecon /* dependences */
|
||||
, spawn_shepherd
|
||||
// , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
|
||||
, unsigned( QTHREAD_SPAWN_LOCAL_PRIORITY )
|
||||
, unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
|
||||
, num_worker_per_shepherd - 1
|
||||
);
|
||||
}
|
||||
|
||||
@ -121,6 +121,7 @@ private:
|
||||
}
|
||||
|
||||
void schedule();
|
||||
void closeout();
|
||||
|
||||
protected :
|
||||
|
||||
@ -490,7 +491,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicy( const TaskPolicy & rhs )
|
||||
: m_default_dependence_capacity( rhs.m_default_dependence_capacity )
|
||||
, m_team_size( m_team_size )
|
||||
, m_team_size( rhs.m_team_size )
|
||||
, m_active_count_root(0)
|
||||
, m_active_count( rhs.m_active_count )
|
||||
{}
|
||||
@ -499,7 +500,7 @@ public:
|
||||
TaskPolicy( const TaskPolicy & rhs
|
||||
, const unsigned arg_default_dependence_capacity )
|
||||
: m_default_dependence_capacity( arg_default_dependence_capacity )
|
||||
, m_team_size( m_team_size )
|
||||
, m_team_size( rhs.m_team_size )
|
||||
, m_active_count_root(0)
|
||||
, m_active_count( rhs.m_active_count )
|
||||
{}
|
||||
|
||||
@ -1,13 +1,13 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
@ -36,7 +36,7 @@
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
@ -50,9 +50,7 @@
|
||||
#include <utility>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <Kokkos_Threads.hpp>
|
||||
#include <Kokkos_hwloc.hpp>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
|
||||
@ -135,7 +133,11 @@ void ThreadsExec::driver(void)
|
||||
|
||||
ThreadsExec::ThreadsExec()
|
||||
: m_pool_base(0)
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
, m_scratch()
|
||||
#else
|
||||
, m_scratch(0)
|
||||
#endif
|
||||
, m_scratch_reduce_end(0)
|
||||
, m_scratch_thread_end(0)
|
||||
, m_numa_rank(0)
|
||||
@ -194,8 +196,25 @@ ThreadsExec::~ThreadsExec()
|
||||
{
|
||||
const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
|
||||
|
||||
m_pool_base = 0 ;
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
if ( m_scratch ) {
|
||||
Record * const r = Record::get_record( m_scratch );
|
||||
|
||||
m_scratch = 0 ;
|
||||
|
||||
Record::decrement( r );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
m_scratch.clear();
|
||||
|
||||
#endif
|
||||
|
||||
m_pool_base = 0 ;
|
||||
m_scratch_reduce_end = 0 ;
|
||||
m_scratch_thread_end = 0 ;
|
||||
m_numa_rank = 0 ;
|
||||
@ -303,6 +322,10 @@ void ThreadsExec::fence()
|
||||
|
||||
s_current_function = 0 ;
|
||||
s_current_function_arg = 0 ;
|
||||
|
||||
// Make sure function and arguments are cleared before
|
||||
// potentially re-activating threads with a subsequent launch.
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
/** \brief Begin execution of the asynchronous functor */
|
||||
@ -317,6 +340,9 @@ void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const vo
|
||||
s_current_function = func ;
|
||||
s_current_function_arg = arg ;
|
||||
|
||||
// Make sure function and arguments are written before activating threads.
|
||||
memory_fence();
|
||||
|
||||
// Activate threads:
|
||||
for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
|
||||
s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
|
||||
@ -376,6 +402,9 @@ void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
|
||||
s_current_function = func ;
|
||||
s_current_function_arg = & s_threads_process ;
|
||||
|
||||
// Make sure function and arguments are written before activating threads.
|
||||
memory_fence();
|
||||
|
||||
const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
|
||||
|
||||
for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
|
||||
@ -394,6 +423,9 @@ void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
|
||||
|
||||
s_current_function_arg = 0 ;
|
||||
s_current_function = 0 ;
|
||||
|
||||
// Make sure function and arguments are cleared before proceeding.
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -405,17 +437,51 @@ void * ThreadsExec::root_reduce_scratch()
|
||||
|
||||
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
|
||||
{
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
|
||||
if ( exec.m_scratch ) {
|
||||
Record * const r = Record::get_record( exec.m_scratch );
|
||||
|
||||
exec.m_scratch = 0 ;
|
||||
|
||||
Record::decrement( r );
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
exec.m_scratch.clear();
|
||||
|
||||
#endif
|
||||
|
||||
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
|
||||
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
|
||||
|
||||
if ( s_threads_process.m_scratch_thread_end ) {
|
||||
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
// Allocate tracked memory:
|
||||
{
|
||||
Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
|
||||
|
||||
Record::increment( r );
|
||||
|
||||
exec.m_scratch = r->data();
|
||||
}
|
||||
|
||||
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
|
||||
|
||||
#else
|
||||
|
||||
exec.m_scratch =
|
||||
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
|
||||
|
||||
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
|
||||
|
||||
#endif
|
||||
|
||||
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
|
||||
|
||||
// touch on this thread
|
||||
@ -452,7 +518,11 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
|
||||
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
return s_threads_process.m_scratch ;
|
||||
#else
|
||||
return s_threads_process.m_scratch.alloc_ptr() ;
|
||||
#endif
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -550,7 +620,8 @@ void ThreadsExec::initialize( unsigned thread_count ,
|
||||
// then they will be given default values based upon hwloc detection
|
||||
// and allowed asynchronous execution.
|
||||
|
||||
const bool hwloc_avail = hwloc::available();
|
||||
const bool hwloc_avail = Kokkos::hwloc::available();
|
||||
const bool hwloc_can_bind = hwloc_avail && Kokkos::hwloc::can_bind_threads();
|
||||
|
||||
if ( thread_count == 0 ) {
|
||||
thread_count = hwloc_avail
|
||||
@ -588,7 +659,11 @@ void ThreadsExec::initialize( unsigned thread_count ,
|
||||
// If hwloc available then spawned thread will
|
||||
// choose its own entry in 's_threads_coord'
|
||||
// otherwise specify the entry.
|
||||
s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );
|
||||
s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_can_bind ? ~0u : ith );
|
||||
|
||||
// Make sure all outstanding memory writes are complete
|
||||
// before spawning the new thread.
|
||||
memory_fence();
|
||||
|
||||
// Spawn thread executing the 'driver()' function.
|
||||
// Wait until spawned thread has attempted to initialize.
|
||||
@ -617,9 +692,13 @@ void ThreadsExec::initialize( unsigned thread_count ,
|
||||
s_current_function_arg = 0 ;
|
||||
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
if ( ! thread_spawn_failed ) {
|
||||
// Bind process to the core on which it was located before spawning occured
|
||||
Kokkos::hwloc::bind_this_thread( proc_coord );
|
||||
if (hwloc_can_bind) {
|
||||
Kokkos::hwloc::bind_this_thread( proc_coord );
|
||||
}
|
||||
|
||||
if ( thread_spawn_begin ) { // Include process in pool.
|
||||
const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
|
||||
@ -702,7 +781,9 @@ void ThreadsExec::finalize()
|
||||
s_threads_exec[0] = 0 ;
|
||||
}
|
||||
|
||||
Kokkos::hwloc::unbind_this_thread();
|
||||
if (Kokkos::hwloc::can_bind_threads() ) {
|
||||
Kokkos::hwloc::unbind_this_thread();
|
||||
}
|
||||
|
||||
s_thread_pool_size[0] = 0 ;
|
||||
s_thread_pool_size[1] = 0 ;
|
||||
|
||||
@ -89,7 +89,11 @@ private:
|
||||
|
||||
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
Impl::AllocationTracker m_scratch ;
|
||||
#else
|
||||
void * m_scratch ;
|
||||
#endif
|
||||
int m_scratch_reduce_end ;
|
||||
int m_scratch_thread_end ;
|
||||
int m_numa_rank ;
|
||||
@ -122,9 +126,19 @@ public:
|
||||
static int get_thread_count();
|
||||
static ThreadsExec * get_thread( const int init_thread_rank );
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
|
||||
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
|
||||
|
||||
#else
|
||||
|
||||
inline void * reduce_memory() const { return m_scratch ; }
|
||||
KOKKOS_INLINE_FUNCTION void * scratch_memory() const
|
||||
{ return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
|
||||
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
|
||||
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }
|
||||
|
||||
|
||||
@ -155,6 +155,7 @@ void ThreadsExec::wait_yield( volatile int & flag , const int value )
|
||||
#elif defined( KOKKOS_HAVE_WINTHREAD )
|
||||
|
||||
/* Windows libraries */
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
#include <process.h>
|
||||
|
||||
|
||||
@ -423,6 +423,8 @@ private:
|
||||
int m_team_size ;
|
||||
int m_team_alloc ;
|
||||
|
||||
size_t m_scratch_size;
|
||||
|
||||
inline
|
||||
void init( const int league_size_request
|
||||
, const int team_size_request )
|
||||
@ -477,19 +479,68 @@ public:
|
||||
inline int team_size() const { return m_team_size ; }
|
||||
inline int team_alloc() const { return m_team_alloc ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
inline size_t scratch_size() const { return m_scratch_size ; }
|
||||
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
|
||||
TeamPolicy( execution_space &
|
||||
, int league_size_request
|
||||
, int team_size_request
|
||||
, int vector_length_request = 1 )
|
||||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_scratch_size ( 0 )
|
||||
{ init(league_size_request,team_size_request); (void) vector_length_request; }
|
||||
|
||||
TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
|
||||
/** \brief Specify league size, request team size */
|
||||
TeamPolicy( execution_space &
|
||||
, int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
{ init(league_size_request,team_size_request); (void) vector_length_request; }
|
||||
, m_scratch_size ( 0 )
|
||||
{ init(league_size_request,execution_space::thread_pool_size(2)); }
|
||||
|
||||
TeamPolicy( int league_size_request
|
||||
, int team_size_request
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_scratch_size ( 0 )
|
||||
{ init(league_size_request,team_size_request); }
|
||||
|
||||
TeamPolicy( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, int /* vector_length_request */ = 1 )
|
||||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_scratch_size ( 0 )
|
||||
{ init(league_size_request,execution_space::thread_pool_size(2)); }
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request
|
||||
, int team_size_request
|
||||
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
|
||||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_scratch_size(scratch_request.total(team_size_request))
|
||||
{ init(league_size_request,team_size_request); }
|
||||
|
||||
|
||||
template<class MemorySpace>
|
||||
TeamPolicy( int league_size_request
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, const Experimental::TeamScratchRequest<MemorySpace> & scratch_request )
|
||||
: m_league_size(0)
|
||||
, m_team_size(0)
|
||||
, m_team_alloc(0)
|
||||
, m_scratch_size(scratch_request.total(execution_space::thread_pool_size(2)))
|
||||
{ init(league_size_request,execution_space::thread_pool_size(2)); }
|
||||
|
||||
typedef Impl::ThreadsExecTeamMember member_type ;
|
||||
|
||||
|
||||
@ -45,6 +45,7 @@
|
||||
#define KOKKOS_THREADS_PARALLEL_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
|
||||
@ -58,363 +59,440 @@ namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelFor Kokkos::Threads with RangePolicy */
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( i );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( ! Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member ibeg , const Member iend )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i );
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( t , i );
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( ThreadsExec & exec , const void * arg )
|
||||
static void exec( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() ) );
|
||||
WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
|
||||
|
||||
ParallelFor::template exec_range< WorkTag >
|
||||
( self.m_functor , range.begin() , range.end() );
|
||||
|
||||
exec.fan_in();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
ParallelFor( const FunctorType & functor
|
||||
, const Policy & policy )
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::start( & ParallelFor::execute , this );
|
||||
|
||||
ThreadsExec::start( & ParallelFor::exec , this );
|
||||
ThreadsExec::fence();
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelFor Kokkos::Threads with TeamPolicy */
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 >
|
||||
class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const int m_shared ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member ) const
|
||||
{ m_func( member ); }
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( member );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member ) const
|
||||
{ m_func( TagType() , member ); }
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
const TagType t{} ;
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( t , member );
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( ThreadsExec & exec , const void * arg )
|
||||
static void exec( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
|
||||
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
self.ParallelFor::template driver< typename Policy::work_tag >( member );
|
||||
}
|
||||
ParallelFor::exec_team< WorkTag >
|
||||
( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) );
|
||||
|
||||
exec.fan_in();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
ParallelFor( const FunctorType & functor
|
||||
, const Policy & policy )
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared );
|
||||
|
||||
ThreadsExec::start( & ParallelFor::execute , this );
|
||||
ThreadsExec::start( & ParallelFor::exec , this );
|
||||
|
||||
ThreadsExec::fence();
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{ }
|
||||
};
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelReduce with Kokkos::Threads and RangePolicy */
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Threads >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
|
||||
typedef typename Policy::work_tag work_tag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1, Arg2, Kokkos::Threads > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( i , update );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( ! Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i , update );
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( t , i , update );
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( ThreadsExec & exec , const void * arg )
|
||||
static void exec( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
driver( self.m_func
|
||||
, ValueInit::init( self.m_func , exec.reduce_memory() )
|
||||
, typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() )
|
||||
);
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( self.m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
|
||||
|
||||
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
|
||||
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & functor ,
|
||||
const Policy & policy ,
|
||||
const HostViewType & result_view )
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , 0 );
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
|
||||
ThreadsExec::start( & ParallelReduce::execute , this );
|
||||
|
||||
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
|
||||
ThreadsExec::start( & ParallelReduce::exec , this );
|
||||
|
||||
ThreadsExec::fence();
|
||||
|
||||
if ( result_view.ptr_on_device() ) {
|
||||
const unsigned n = ValueTraits::value_count( m_func );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
|
||||
if ( m_result_ptr ) {
|
||||
|
||||
const pointer_type data =
|
||||
(pointer_type) ThreadsExec::root_reduce_scratch();
|
||||
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
ParallelReduce( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy ,
|
||||
const HostViewType & arg_result_view )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< HostViewType >::value
|
||||
, "Kokkos::Threads reduce result must be a View" );
|
||||
|
||||
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
|
||||
, "Kokkos::Threads reduce result must be a View in HostSpace" );
|
||||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelReduce with Kokkos::Threads and TeamPolicy */
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 >
|
||||
class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
|
||||
typedef typename Policy::work_tag work_tag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
|
||||
typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const pointer_type m_result_ptr ;
|
||||
const int m_shared ;
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member
|
||||
, reference_type update ) const
|
||||
{ m_func( member , update ); }
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
{
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( member , update );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
|
||||
const typename Policy::member_type & >::type member
|
||||
, reference_type update ) const
|
||||
{ m_func( TagType() , member , update ); }
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
{
|
||||
const TagType t{} ;
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
functor( t , member , update );
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( ThreadsExec & exec , const void * arg )
|
||||
static void exec( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
|
||||
// Initialize thread-local value
|
||||
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
|
||||
ParallelReduce::template exec_team< WorkTag >
|
||||
( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
|
||||
, ValueInit::init( self.m_functor , exec.reduce_memory() ) );
|
||||
|
||||
typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
|
||||
for ( ; member.valid() ; member.next() ) {
|
||||
self.ParallelReduce::template driver< work_tag >( member , update );
|
||||
}
|
||||
|
||||
exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
|
||||
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor );
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
ParallelReduce( const FunctorType & functor
|
||||
, const Policy & policy )
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared );
|
||||
|
||||
ThreadsExec::start( & ParallelReduce::execute , this );
|
||||
ThreadsExec::start( & ParallelReduce::exec , this );
|
||||
|
||||
ThreadsExec::fence();
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
|
||||
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
|
||||
|
||||
const unsigned n = ValueTraits::value_count( m_functor );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
|
||||
}
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
ParallelReduce( const FunctorType & functor
|
||||
, const Policy & policy
|
||||
, const ViewType & result )
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
, m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
|
||||
{
|
||||
ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
|
||||
|
||||
ThreadsExec::start( & ParallelReduce::execute , this );
|
||||
|
||||
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
|
||||
|
||||
ThreadsExec::fence();
|
||||
|
||||
const unsigned n = ValueTraits::value_count( m_func );
|
||||
for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
|
||||
}
|
||||
ParallelReduce( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy
|
||||
, const ViewType & arg_result )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
{ }
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
/* ParallelScan with Kokkos::Threads and RangePolicy */
|
||||
|
||||
template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
|
||||
class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
|
||||
class ParallelScan< FunctorType
|
||||
, Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Threads >
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
|
||||
typedef typename Policy::work_tag work_tag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , work_tag > ValueInit ;
|
||||
typedef Kokkos::RangePolicy< Arg0, Arg1, Arg2, Kokkos::Threads > Policy ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_func ;
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const bool final
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( i , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
template< class PType >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
void driver( typename Impl::enable_if<
|
||||
( ! Impl::is_same< typename PType::work_tag , void >::value )
|
||||
, const FunctorType & >::type functor
|
||||
, reference_type update
|
||||
, const bool final
|
||||
, const PType & range )
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_range( const FunctorType & functor
|
||||
, const Member & ibeg , const Member & iend
|
||||
, reference_type update , const bool final )
|
||||
{
|
||||
const typename PType::member_type e = range.end();
|
||||
for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
|
||||
functor( typename PType::work_tag() , i , update , final );
|
||||
const TagType t{} ;
|
||||
#if defined( KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ) && \
|
||||
defined( KOKKOS_HAVE_PRAGMA_IVDEP )
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for ( Member i = ibeg ; i < iend ; ++i ) {
|
||||
functor( t , i , update , final );
|
||||
}
|
||||
}
|
||||
|
||||
static void execute( ThreadsExec & exec , const void * arg )
|
||||
static void exec( ThreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelScan & self = * ((const ParallelScan *) arg );
|
||||
|
||||
const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
|
||||
const WorkRange range( self.m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
|
||||
reference_type update =
|
||||
ValueInit::init( self.m_functor , exec.reduce_memory() );
|
||||
|
||||
driver( self.m_func , update , false , range );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( self.m_functor , range.begin(), range.end(), update, false );
|
||||
|
||||
// exec.<FunctorType,work_tag>scan_large( self.m_func );
|
||||
exec.template scan_small<FunctorType,work_tag>( self.m_func );
|
||||
// exec.template scan_large<FunctorType,WorkTag>( self.m_functor );
|
||||
exec.template scan_small<FunctorType,WorkTag>( self.m_functor );
|
||||
|
||||
driver( self.m_func , update , true , range );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( self.m_functor , range.begin(), range.end(), update, true );
|
||||
|
||||
exec.fan_in();
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
ParallelScan( const FunctorType & functor , const Policy & policy )
|
||||
: m_func( functor )
|
||||
, m_policy( policy )
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_func ) , 0 );
|
||||
ThreadsExec::start( & ParallelScan::execute , this );
|
||||
ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
|
||||
ThreadsExec::start( & ParallelScan::exec , this );
|
||||
ThreadsExec::fence();
|
||||
}
|
||||
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
{ }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
18
lib/kokkos/core/src/impl/CMakeLists.txt
Normal file
18
lib/kokkos/core/src/impl/CMakeLists.txt
Normal file
@ -0,0 +1,18 @@
|
||||
|
||||
# Build description for the kokkoscore_impl library.
# HEADERS and SOURCES are first reset to empty strings and then filled by
# globbing every *.hpp and *.cpp file (patterns are given without a directory).
SET(HEADERS "")
|
||||
SET(SOURCES "")
|
||||
|
||||
FILE(GLOB HEADERS *.hpp)
|
||||
FILE(GLOB SOURCES *.cpp)
|
||||
|
||||
# Register the library with TriBITS.  The globbed headers are passed under
# NOINSTALLHEADERS, the globbed sources under SOURCES, and DEPLIBS is left empty.
TRIBITS_ADD_LIBRARY(
|
||||
kokkoscore_impl
|
||||
NOINSTALLHEADERS ${HEADERS}
|
||||
SOURCES ${SOURCES}
|
||||
DEPLIBS
|
||||
)
|
||||
|
||||
# Include directory under the install prefix; the sub-directory name comes from
# the ${PROJECT_NAME}_INSTALL_INCLUDE_DIR variable.
SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
|
||||
|
||||
# The headers are installed explicitly here, into the impl/ sub-directory of
# TRILINOS_INCDIR.
INSTALL(FILES ${HEADERS} DESTINATION ${TRILINOS_INCDIR}/impl/)
|
||||
|
||||
@ -47,6 +47,27 @@ namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
// Definition of the static tracking flag; it starts at 1.
// tracking_claim_and_disable() compare-exchanges it 1 -> 0 and
// tracking_release_and_enable() compare-exchanges it 0 -> 1.
int SharedAllocationRecord< void , void >::s_tracking_enabled = 1 ;
|
||||
|
||||
// Claim and disable the shared-allocation tracking flag.
// Does not return until the compare-exchange below reports success.
void SharedAllocationRecord< void , void >::tracking_claim_and_disable()
|
||||
{
|
||||
// A host thread claims and disables the tracking flag.
// The loop body is intentionally empty: the compare-exchange on
// s_tracking_enabled (arguments 1, 0) is simply retried until the call
// yields a true value.
// NOTE(review): presumably argument order is (dest, compare, new) - confirm
// against the Kokkos atomic API.
|
||||
|
||||
while ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 1, 0 ) );
|
||||
}
|
||||
|
||||
// Release and re-enable the shared-allocation tracking flag.
// Makes a single compare-exchange attempt on s_tracking_enabled (arguments
// 0, 1); if the call yields a false value, Kokkos::Impl::throw_runtime_exception
// is called with a message stating that this thread did not hold the lock.
void SharedAllocationRecord< void , void >::tracking_release_and_enable()
|
||||
{
|
||||
// The host thread that claimed and disabled the tracking flag
|
||||
// now releases and enables tracking.
|
||||
|
||||
// No retry here, unlike the claim side: a failed exchange is reported as an error.
if ( ! Kokkos::atomic_compare_exchange_strong( & s_tracking_enabled, 0, 1 ) ){
|
||||
Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord<>::tracking_release_and_enable FAILED, this host process thread did not hold the lock" );
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
bool
|
||||
SharedAllocationRecord< void , void >::
|
||||
is_sane( SharedAllocationRecord< void , void > * arg_record )
|
||||
@ -61,7 +82,7 @@ is_sane( SharedAllocationRecord< void , void > * arg_record )
|
||||
SharedAllocationRecord * root_next = 0 ;
|
||||
|
||||
// Lock the list:
|
||||
while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == 0 );
|
||||
while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == zero );
|
||||
|
||||
for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) {
|
||||
const bool ok_non_null = rec && rec->m_prev && ( rec == root || rec->m_next );
|
||||
@ -73,14 +94,25 @@ is_sane( SharedAllocationRecord< void , void > * arg_record )
|
||||
ok = ok_root && ok_prev_next && ok_next_prev && ok_count ;
|
||||
|
||||
if ( ! ok ) {
|
||||
fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n"
|
||||
, reinterpret_cast< unsigned long >( rec )
|
||||
//Formatting dependent on sizeof(uintptr_t)
|
||||
const char * format_string;
|
||||
|
||||
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
|
||||
format_string = "Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n";
|
||||
}
|
||||
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
|
||||
format_string = "Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12llx){ m_count(%d) m_root(0x%.12llx) m_next(0x%.12llx) m_prev(0x%.12llx) m_next->m_prev(0x%.12llx) m_prev->m_next(0x%.12llx) }\n";
|
||||
}
|
||||
|
||||
fprintf(stderr
|
||||
, format_string
|
||||
, reinterpret_cast< uintptr_t >( rec )
|
||||
, rec->m_count
|
||||
, reinterpret_cast< unsigned long >( rec->m_root )
|
||||
, reinterpret_cast< unsigned long >( rec->m_next )
|
||||
, reinterpret_cast< unsigned long >( rec->m_prev )
|
||||
, reinterpret_cast< unsigned long >( rec->m_next->m_prev )
|
||||
, reinterpret_cast< unsigned long >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
|
||||
, reinterpret_cast< uintptr_t >( rec->m_root )
|
||||
, reinterpret_cast< uintptr_t >( rec->m_next )
|
||||
, reinterpret_cast< uintptr_t >( rec->m_prev )
|
||||
, reinterpret_cast< uintptr_t >( rec->m_next->m_prev )
|
||||
, reinterpret_cast< uintptr_t >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
|
||||
);
|
||||
}
|
||||
|
||||
@ -102,7 +134,7 @@ SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * con
|
||||
SharedAllocationRecord * root_next = 0 ;
|
||||
|
||||
// Lock the list:
|
||||
while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , 0 ) ) == 0 );
|
||||
while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , zero ) ) == zero );
|
||||
|
||||
// Iterate searching for the record with this data pointer
|
||||
|
||||
@ -148,7 +180,7 @@ SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root
|
||||
m_prev = m_root ;
|
||||
|
||||
// Read root->m_next and lock by setting to zero
|
||||
while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == 0 );
|
||||
while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == zero );
|
||||
|
||||
m_next->m_prev = this ;
|
||||
|
||||
@ -187,7 +219,7 @@ decrement( SharedAllocationRecord< void , void > * arg_record )
|
||||
SharedAllocationRecord * root_next = 0 ;
|
||||
|
||||
// Lock the list:
|
||||
while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , 0 ) ) == 0 );
|
||||
while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , zero ) ) == zero );
|
||||
|
||||
arg_record->m_next->m_prev = arg_record->m_prev ;
|
||||
|
||||
@ -232,16 +264,26 @@ print_host_accessible_records( std::ostream & s
|
||||
|
||||
if ( detail ) {
|
||||
do {
|
||||
//Formatting dependent on sizeof(uintptr_t)
|
||||
const char * format_string;
|
||||
|
||||
snprintf( buffer , 256 , "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
|
||||
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
|
||||
format_string = "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n";
|
||||
}
|
||||
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
|
||||
format_string = "%s addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ 0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n";
|
||||
}
|
||||
|
||||
snprintf( buffer , 256
|
||||
, format_string
|
||||
, space_name
|
||||
, reinterpret_cast<unsigned long>( r )
|
||||
, reinterpret_cast<unsigned long>( r->m_prev )
|
||||
, reinterpret_cast<unsigned long>( r->m_next )
|
||||
, reinterpret_cast<unsigned long>( r->m_alloc_ptr )
|
||||
, reinterpret_cast<uintptr_t>( r )
|
||||
, reinterpret_cast<uintptr_t>( r->m_prev )
|
||||
, reinterpret_cast<uintptr_t>( r->m_next )
|
||||
, reinterpret_cast<uintptr_t>( r->m_alloc_ptr )
|
||||
, r->m_alloc_size
|
||||
, r->m_count
|
||||
, reinterpret_cast<unsigned long>( r->m_dealloc )
|
||||
, reinterpret_cast<uintptr_t>( r->m_dealloc )
|
||||
, r->m_alloc_ptr->m_label
|
||||
);
|
||||
std::cout << buffer ;
|
||||
@ -251,10 +293,20 @@ print_host_accessible_records( std::ostream & s
|
||||
else {
|
||||
do {
|
||||
if ( r->m_alloc_ptr ) {
|
||||
//Formatting dependent on sizeof(uintptr_t)
|
||||
const char * format_string;
|
||||
|
||||
snprintf( buffer , 256 , "%s [ 0x%.12lx + %ld ] %s\n"
|
||||
if (sizeof(uintptr_t) == sizeof(unsigned long)) {
|
||||
format_string = "%s [ 0x%.12lx + %ld ] %s\n";
|
||||
}
|
||||
else if (sizeof(uintptr_t) == sizeof(unsigned long long)) {
|
||||
format_string = "%s [ 0x%.12llx + %ld ] %s\n";
|
||||
}
|
||||
|
||||
snprintf( buffer , 256
|
||||
, format_string
|
||||
, space_name
|
||||
, reinterpret_cast< unsigned long >( r->data() )
|
||||
, reinterpret_cast< uintptr_t >( r->data() )
|
||||
, r->size()
|
||||
, r->m_alloc_ptr->m_label
|
||||
);
|
||||
|
||||
@ -41,6 +41,9 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_SHARED_ALLOC_HPP_
|
||||
#define KOKKOS_SHARED_ALLOC_HPP_
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
@ -78,6 +81,8 @@ protected:
|
||||
|
||||
typedef void (* function_type )( SharedAllocationRecord<void,void> * );
|
||||
|
||||
static int s_tracking_enabled ;
|
||||
|
||||
SharedAllocationHeader * const m_alloc_ptr ;
|
||||
size_t const m_alloc_size ;
|
||||
function_type const m_dealloc ;
|
||||
@ -100,6 +105,18 @@ protected:
|
||||
|
||||
public:
|
||||
|
||||
static int tracking_enabled() { return s_tracking_enabled ; }
|
||||
|
||||
/**\brief A host process thread claims and disables the
|
||||
* shared allocation tracking flag.
|
||||
*/
|
||||
static void tracking_claim_and_disable();
|
||||
|
||||
/**\brief A host process thread releases and enables the
|
||||
* shared allocation tracking flag.
|
||||
*/
|
||||
static void tracking_release_and_enable();
|
||||
|
||||
~SharedAllocationRecord() = default ;
|
||||
|
||||
constexpr SharedAllocationRecord()
|
||||
@ -148,6 +165,25 @@ public:
|
||||
, const bool detail );
|
||||
};
|
||||
|
||||
namespace {
|
||||
|
||||
/* Taking the address of this function so make sure it is unique */
|
||||
template < class MemorySpace , class DestroyFunctor >
|
||||
void deallocate( SharedAllocationRecord<void,void> * record_ptr )
|
||||
{
|
||||
typedef SharedAllocationRecord< MemorySpace , void > base_type ;
|
||||
typedef SharedAllocationRecord< MemorySpace , DestroyFunctor > this_type ;
|
||||
|
||||
this_type * const ptr = static_cast< this_type * >(
|
||||
static_cast< base_type * >( record_ptr ) );
|
||||
|
||||
ptr->m_destroy.destroy_shared_allocation();
|
||||
|
||||
delete ptr ;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Memory space specialization of SharedAllocationRecord< Space , void > requires :
|
||||
*
|
||||
@ -158,25 +194,23 @@ public:
|
||||
* Space m_space ;
|
||||
* }
|
||||
*/
|
||||
|
||||
template< class MemorySpace , class DestroyFunctor >
|
||||
class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void >
|
||||
{
|
||||
private:
|
||||
|
||||
static void deallocate( SharedAllocationRecord<void,void> * record_ptr )
|
||||
{ delete static_cast<SharedAllocationRecord<MemorySpace,DestroyFunctor>*>(record_ptr); }
|
||||
|
||||
SharedAllocationRecord( const MemorySpace & arg_space
|
||||
, const std::string & arg_label
|
||||
, const size_t arg_alloc
|
||||
)
|
||||
/* Allocate user memory as [ SharedAllocationHeader , user_memory ] */
|
||||
: SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & deallocate )
|
||||
: SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & Kokkos::Experimental::Impl::deallocate< MemorySpace , DestroyFunctor > )
|
||||
, m_destroy()
|
||||
{}
|
||||
|
||||
~SharedAllocationRecord() { m_destroy.destroy_shared_allocation(); }
|
||||
SharedAllocationRecord() = delete ;
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
|
||||
|
||||
public:
|
||||
|
||||
@ -204,42 +238,48 @@ private:
|
||||
|
||||
typedef SharedAllocationRecord<void,void> Record ;
|
||||
|
||||
enum : unsigned long {
|
||||
DO_NOT_DEREF_FLAG = 0x01ul
|
||||
};
|
||||
enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
|
||||
|
||||
// The allocation record resides in Host memory space
|
||||
Record * m_record ;
|
||||
unsigned long m_record_bits;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static Record * disable( Record * rec )
|
||||
{ return reinterpret_cast<Record*>( reinterpret_cast<unsigned long>( rec ) & DO_NOT_DEREF_FLAG ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void increment() const
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void decrement() const
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
|
||||
#endif
|
||||
}
|
||||
Record * m_record ;
|
||||
uintptr_t m_record_bits ;
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr SharedAllocationTracker() : m_record_bits( DO_NOT_DEREF_FLAG ) {}
|
||||
// Use macros instead of inline functions to reduce
|
||||
// pressure on compiler optimization by reducing
|
||||
// number of symbols and inline functions.
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
#define KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED \
|
||||
Record::tracking_enabled()
|
||||
|
||||
#define KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT \
|
||||
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
|
||||
|
||||
#define KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT \
|
||||
if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
|
||||
|
||||
#else
|
||||
|
||||
#define KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED 0
|
||||
|
||||
#define KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT /* */
|
||||
|
||||
#define KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT /* */
|
||||
|
||||
#endif
|
||||
|
||||
/** \brief Assign a specialized record */
|
||||
inline
|
||||
void assign_allocated_record_to_uninitialized( Record * arg_record )
|
||||
{ Record::increment( m_record = arg_record ); }
|
||||
|
||||
template< class MemorySpace >
|
||||
constexpr
|
||||
SharedAllocationRecord< MemorySpace , void > & get_record() const
|
||||
SharedAllocationRecord< MemorySpace , void > &
|
||||
get_record() const
|
||||
{ return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
|
||||
|
||||
template< class MemorySpace >
|
||||
@ -252,36 +292,92 @@ public:
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SharedAllocationTracker( Record * arg_record )
|
||||
: m_record( arg_record ) { increment(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
~SharedAllocationTracker() { decrement(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SharedAllocationTracker( const SharedAllocationTracker & rhs )
|
||||
: m_record( rhs.m_record ) { increment(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SharedAllocationTracker( SharedAllocationTracker && rhs )
|
||||
: m_record( rhs.m_record ) { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
|
||||
int use_count() const
|
||||
{
|
||||
decrement();
|
||||
m_record = rhs.m_record ;
|
||||
increment();
|
||||
return *this ;
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
Record * const tmp = reinterpret_cast<Record*>( m_record_bits & ~DO_NOT_DEREF_FLAG );
|
||||
return ( tmp ? tmp->use_count() : 0 );
|
||||
#else
|
||||
return 0 ;
|
||||
#endif
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
// Releases this tracker's reference when it holds a tracked record.
// The decrement macro expands to nothing outside the host execution space.
KOKKOS_FORCEINLINE_FUNCTION
~SharedAllocationTracker()
{
  KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
}
|
||||
|
||||
// Default state: no record, flagged DO_NOT_DEREF so that the
// increment / decrement macros never touch m_record.
KOKKOS_FORCEINLINE_FUNCTION
constexpr SharedAllocationTracker()
  : m_record_bits( DO_NOT_DEREF_FLAG )
  {}
|
||||
|
||||
// Move:
|
||||
|
||||
// Move construction: take over rhs's record bits unchanged (no reference
// count update) and reset rhs to the default-constructed state.
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker( SharedAllocationTracker && rhs )
  : m_record_bits( rhs.m_record_bits )
{
  rhs.m_record_bits = DO_NOT_DEREF_FLAG ;
}
|
||||
|
||||
// Move assignment: release whatever this tracker currently references, then
// take over rhs's record bits and leave rhs in the default-constructed
// (DO_NOT_DEREF_FLAG) state.  No increment is needed because the reference
// held by rhs is transferred, not duplicated.
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs )
{
  // If this is tracking then must decrement.  This must happen while this
  // tracker still refers to its own record: overwriting m_record with
  // rhs.m_record beforehand would make the decrement act on rhs's record.
  KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
  // Move and reset RHS to default constructed value.
  m_record_bits = rhs.m_record_bits ;
  rhs.m_record_bits = DO_NOT_DEREF_FLAG ;
  return *this ;
}
|
||||
|
||||
// Copy:
|
||||
|
||||
// Copy construction: refer to the same record as rhs.  When tracking is
// enabled the bits are copied as-is; otherwise DO_NOT_DEREF_FLAG is added so
// the increment below (and later decrements) skip this copy.
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker( const SharedAllocationTracker & rhs )
  : m_record_bits( rhs.m_record_bits |
                   ( KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
                     ? uintptr_t(0)
                     : uintptr_t(DO_NOT_DEREF_FLAG) ) )
  { KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT }
|
||||
|
||||
/** \brief Copy construction may disable tracking. */
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
SharedAllocationTracker( const SharedAllocationTracker & rhs
|
||||
, const bool enable_tracking )
|
||||
: m_record_bits( KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
|
||||
&& enable_tracking
|
||||
? rhs.m_record_bits
|
||||
: rhs.m_record_bits | DO_NOT_DEREF_FLAG )
|
||||
{ KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT }
|
||||
|
||||
// Copy assignment: release the currently held reference (if tracked), adopt
// rhs's record bits (flagged DO_NOT_DEREF when tracking is disabled), and
// acquire a reference on the newly held record (if tracked).
KOKKOS_FORCEINLINE_FUNCTION
SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
{
  const uintptr_t new_bits = KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
                           ? rhs.m_record_bits
                           : rhs.m_record_bits | DO_NOT_DEREF_FLAG ;

  // When the bits do not change (self-assignment, or rhs already refers to
  // the same record in the same tracking state) the decrement / increment
  // pair is a net no-op.  Skipping it also avoids decrementing the count to
  // zero and then incrementing a record that the decrement may already have
  // released (NOTE(review): assumes Record::decrement deallocates at zero).
  if ( new_bits != m_record_bits ) {
    // If this is tracking then must decrement
    KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
    m_record_bits = new_bits ;
    KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
  }
  return *this ;
}
|
||||
|
||||
/** \brief Copy assignment may disable tracking */
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
void assign( const SharedAllocationTracker & rhs
|
||||
, const bool enable_tracking )
|
||||
{
|
||||
KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
|
||||
m_record_bits = KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
|
||||
&& enable_tracking
|
||||
? rhs.m_record_bits
|
||||
: rhs.m_record_bits | DO_NOT_DEREF_FLAG ;
|
||||
KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
|
||||
}
|
||||
|
||||
#undef KOKKOS_SHARED_ALLOCATION_TRACKER_ENABLED
|
||||
#undef KOKKOS_SHARED_ALLOCATION_TRACKER_INCREMENT
|
||||
#undef KOKKOS_SHARED_ALLOCATION_TRACKER_DECREMENT
|
||||
|
||||
};
|
||||
|
||||
|
||||
@ -289,4 +385,4 @@ public:
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@ -47,6 +47,28 @@
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/* For backward compatibility */
|
||||
|
||||
// Holds only a label string.  Judging by its name it requests a view
// allocation without initialization -- confirm against the View
// constructors that consume it.
struct ViewAllocateWithoutInitializing {

  const std::string label ;

  // Empty label.
  ViewAllocateWithoutInitializing()
    : label()
    {}

  // Label copied from a std::string.
  ViewAllocateWithoutInitializing( const std::string & arg_label )
    : label( arg_label )
    {}

  // Label copied from a NUL-terminated C string.
  ViewAllocateWithoutInitializing( const char * const arg_label )
    : label( arg_label )
    {}
};
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
@ -50,8 +50,8 @@ namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template< class DataType , class V , long N , class P , class ArrayLayout >
|
||||
struct ViewDataAnalysis< DataType , Kokkos::Array<V,N,P> , ArrayLayout >
|
||||
template< class DataType , class ArrayLayout , class V , size_t N , class P >
|
||||
struct ViewDataAnalysis< DataType , ArrayLayout , Kokkos::Array<V,N,P> >
|
||||
{
|
||||
private:
|
||||
|
||||
@ -73,15 +73,7 @@ private:
|
||||
, typename array_analysis::const_value_type
|
||||
>::value };
|
||||
|
||||
typedef ViewDimension< ( dimension::rank == 0 ? N : dimension::arg_N0 )
|
||||
, ( dimension::rank == 1 ? N : dimension::arg_N1 )
|
||||
, ( dimension::rank == 2 ? N : dimension::arg_N2 )
|
||||
, ( dimension::rank == 3 ? N : dimension::arg_N3 )
|
||||
, ( dimension::rank == 4 ? N : dimension::arg_N4 )
|
||||
, ( dimension::rank == 5 ? N : dimension::arg_N5 )
|
||||
, ( dimension::rank == 6 ? N : dimension::arg_N6 )
|
||||
, ( dimension::rank == 7 ? N : dimension::arg_N7 )
|
||||
> array_scalar_dimension ;
|
||||
typedef typename dimension::template append<N>::type array_scalar_dimension ;
|
||||
|
||||
typedef typename std::conditional< is_const , const V , V >::type scalar_type ;
|
||||
typedef V non_const_scalar_type ;
|
||||
@ -113,18 +105,18 @@ namespace Impl {
|
||||
|
||||
/** \brief View mapping for non-specialized data type and standard layout */
|
||||
template< class Traits >
|
||||
class ViewMapping< Traits , void ,
|
||||
typename std::enable_if<( std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
|
||||
( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
|
||||
)>::type >
|
||||
class ViewMapping< Traits ,
|
||||
typename std::enable_if<(
|
||||
std::is_same< typename Traits::specialize , Kokkos::Array<> >::value &&
|
||||
( std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value )
|
||||
)>::type >
|
||||
{
|
||||
private:
|
||||
|
||||
template< class , class , typename > friend class ViewMapping ;
|
||||
template< class , bool , bool , bool , bool , bool , bool , bool , bool , class > friend struct SubviewMapping ;
|
||||
template< class , class , class , class > friend class Kokkos::Experimental::View ;
|
||||
template< class , class ... > friend class ViewMapping ;
|
||||
template< class , class ... > friend class Kokkos::Experimental::View ;
|
||||
|
||||
typedef ViewOffset< typename Traits::dimension
|
||||
, typename Traits::array_layout
|
||||
@ -187,16 +179,20 @@ public:
|
||||
// Range span
|
||||
|
||||
/** \brief Span of the mapped range */
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_offset.span(); }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t span() const
|
||||
{ return m_offset.span() * Array_N ; }
|
||||
|
||||
/** \brief Is the mapped range span contiguous */
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_offset.span_is_contiguous(); }
|
||||
KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
|
||||
{ return m_offset.span_is_contiguous(); }
|
||||
|
||||
typedef typename std::conditional< is_contiguous_reference , contiguous_reference , strided_reference >::type reference_type ;
|
||||
|
||||
typedef handle_type pointer_type ;
|
||||
|
||||
/** \brief If data references are lvalue_reference then can query pointer to memory */
|
||||
KOKKOS_INLINE_FUNCTION constexpr typename Traits::value_type * data() const
|
||||
{ return (typename Traits::value_type *) 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const
|
||||
{ return m_handle ; }
|
||||
|
||||
//----------------------------------------
|
||||
// The View class performs all rank and bounds checking before
|
||||
@ -259,14 +255,14 @@ public:
|
||||
private:
|
||||
|
||||
enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
|
||||
enum { MemorySpanSize = sizeof(typename Traits::value_type) };
|
||||
enum { MemorySpanSize = sizeof(scalar_type) };
|
||||
|
||||
public:
|
||||
|
||||
/** \brief Span, in bytes, of the referenced memory */
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const
|
||||
{
|
||||
return ( m_stride * sizeof(typename Traits::value_type) + MemorySpanMask ) & ~size_t(MemorySpanMask);
|
||||
return ( m_offset.span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
|
||||
}
|
||||
|
||||
/** \brief Span, in bytes, of the required memory */
|
||||
@ -277,7 +273,7 @@ public:
|
||||
, const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 )
|
||||
{
|
||||
typedef std::integral_constant< unsigned , AllowPadding ? MemorySpanSize : 0 > padding ;
|
||||
return ( offset_type( padding(), N0, N1, N2, N3, N4, N5, N6, N7 ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
|
||||
return ( offset_type( padding(), N0, N1, N2, N3, N4, N5, N6, N7 ).span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
|
||||
}
|
||||
|
||||
/** \brief Span, in bytes, of the required memory */
|
||||
@ -286,7 +282,7 @@ public:
|
||||
static constexpr size_t memory_span( const std::integral_constant<bool,AllowPadding> &
|
||||
, const typename Traits::array_layout & layout )
|
||||
{
|
||||
return ( offset_type( layout ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
|
||||
return ( offset_type( layout ).span() * Array_N * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
@ -305,11 +301,11 @@ public:
|
||||
|
||||
template< bool AllowPadding >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ViewMapping( void * ptr
|
||||
ViewMapping( pointer_type ptr
|
||||
, const std::integral_constant<bool,AllowPadding> &
|
||||
, const size_t N0 , const size_t N1 , const size_t N2 , const size_t N3
|
||||
, const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 )
|
||||
: m_handle( reinterpret_cast< handle_type >( ptr ) )
|
||||
: m_handle( ptr )
|
||||
, m_offset( std::integral_constant< unsigned , AllowPadding ? sizeof(typename Traits::value_type) : 0 >()
|
||||
, N0, N1, N2, N3, N4, N5, N6, N7 )
|
||||
, m_stride( m_offset.span() )
|
||||
@ -317,10 +313,10 @@ public:
|
||||
|
||||
template< bool AllowPadding >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ViewMapping( void * ptr
|
||||
ViewMapping( pointer_type ptr
|
||||
, const std::integral_constant<bool,AllowPadding> &
|
||||
, const typename Traits::array_layout & layout )
|
||||
: m_handle( reinterpret_cast< handle_type >( ptr ) )
|
||||
: m_handle( ptr )
|
||||
, m_offset( layout )
|
||||
, m_stride( m_offset.span() )
|
||||
{}
|
||||
@ -340,7 +336,8 @@ public:
|
||||
{
|
||||
typedef Kokkos::RangePolicy< ExecSpace , size_t > Policy ;
|
||||
|
||||
(void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_stride ) );
|
||||
const Kokkos::Impl::ParallelFor< ViewMapping , Policy > closure( *this , Policy( 0 , m_stride ) );
|
||||
closure.execute();
|
||||
ExecSpace::fence();
|
||||
}
|
||||
|
||||
@ -379,8 +376,8 @@ public:
|
||||
enum { is_assignable = true };
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker TrackType ;
|
||||
typedef ViewMapping< DstTraits , void , void > DstType ;
|
||||
typedef ViewMapping< SrcTraits , void , void > SrcType ;
|
||||
typedef ViewMapping< DstTraits , void > DstType ;
|
||||
typedef ViewMapping< SrcTraits , void > SrcType ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
|
||||
@ -438,8 +435,8 @@ public:
|
||||
std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value };
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationTracker TrackType ;
|
||||
typedef ViewMapping< DstTraits , void , void > DstType ;
|
||||
typedef ViewMapping< SrcTraits , void , void > SrcType ;
|
||||
typedef ViewMapping< DstTraits , void > DstType ;
|
||||
typedef ViewMapping< SrcTraits , void > SrcType ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
|
||||
@ -452,6 +449,7 @@ public:
|
||||
// Arguments beyond the destination rank are ignored.
|
||||
if ( src.span_is_contiguous() ) { // not padded
|
||||
dst.m_offset = dst_offset_type( std::integral_constant<unsigned,0>()
|
||||
, ( 0 < SrcType::Rank ? src.dimension_0() : SrcTraits::value_type::size() )
|
||||
, ( 1 < SrcType::Rank ? src.dimension_1() : SrcTraits::value_type::size() )
|
||||
, ( 2 < SrcType::Rank ? src.dimension_2() : SrcTraits::value_type::size() )
|
||||
, ( 3 < SrcType::Rank ? src.dimension_3() : SrcTraits::value_type::size() )
|
||||
@ -483,34 +481,47 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/** \brief View mapping for non-specialized data type and standard layout */
|
||||
template< class Traits , bool R0 , bool R1 , bool R2 , bool R3 , bool R4 , bool R5 , bool R6 , bool R7 >
|
||||
struct SubviewMapping< Traits, R0, R1, R2, R3, R4, R5, R6, R7 ,
|
||||
typename std::enable_if<(
|
||||
std::is_same< typename Traits::specialize , Kokkos::Array<> >::value
|
||||
&&
|
||||
(
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
|
||||
)
|
||||
)>::type >
|
||||
template< class SrcTraits , class ... Args >
|
||||
struct ViewMapping
|
||||
< typename std::enable_if<(
|
||||
std::is_same< typename SrcTraits::specialize , Kokkos::Array<> >::value
|
||||
&&
|
||||
(
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
|
||||
std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
|
||||
)
|
||||
)>::type
|
||||
, SrcTraits
|
||||
, Args ... >
|
||||
{
|
||||
private:
|
||||
|
||||
// Subview's rank
|
||||
static_assert( SrcTraits::rank == sizeof...(Args) , "" );
|
||||
|
||||
enum : bool
|
||||
{ R0 = is_integral_extent<0,Args...>::value
|
||||
, R1 = is_integral_extent<1,Args...>::value
|
||||
, R2 = is_integral_extent<2,Args...>::value
|
||||
, R3 = is_integral_extent<3,Args...>::value
|
||||
, R4 = is_integral_extent<4,Args...>::value
|
||||
, R5 = is_integral_extent<5,Args...>::value
|
||||
, R6 = is_integral_extent<6,Args...>::value
|
||||
, R7 = is_integral_extent<7,Args...>::value
|
||||
};
|
||||
|
||||
enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
|
||||
+ unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
|
||||
|
||||
// Whether right-most rank is a range.
|
||||
enum { R0_rev = 0 == Traits::rank ? false : (
|
||||
1 == Traits::rank ? R0 : (
|
||||
2 == Traits::rank ? R1 : (
|
||||
3 == Traits::rank ? R2 : (
|
||||
4 == Traits::rank ? R3 : (
|
||||
5 == Traits::rank ? R4 : (
|
||||
6 == Traits::rank ? R5 : (
|
||||
7 == Traits::rank ? R6 : R7 ))))))) };
|
||||
enum { R0_rev = 0 == SrcTraits::rank ? false : (
|
||||
1 == SrcTraits::rank ? R0 : (
|
||||
2 == SrcTraits::rank ? R1 : (
|
||||
3 == SrcTraits::rank ? R2 : (
|
||||
4 == SrcTraits::rank ? R3 : (
|
||||
5 == SrcTraits::rank ? R4 : (
|
||||
6 == SrcTraits::rank ? R5 : (
|
||||
7 == SrcTraits::rank ? R6 : R7 ))))))) };
|
||||
|
||||
// Subview's layout
|
||||
typedef typename std::conditional<
|
||||
@ -519,15 +530,15 @@ private:
|
||||
||
|
||||
// OutputRank 1 or 2, InputLayout Left, Interval 0
|
||||
// because single stride one or second index has a stride.
|
||||
( rank <= 2 && R0 && std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value )
|
||||
( rank <= 2 && R0 && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value )
|
||||
||
|
||||
// OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
|
||||
// because single stride one or second index has a stride.
|
||||
( rank <= 2 && R0_rev && std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value )
|
||||
), typename Traits::array_layout , Kokkos::LayoutStride
|
||||
( rank <= 2 && R0_rev && std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value )
|
||||
), typename SrcTraits::array_layout , Kokkos::LayoutStride
|
||||
>::type array_layout ;
|
||||
|
||||
typedef typename Traits::value_type value_type ;
|
||||
typedef typename SrcTraits::value_type value_type ;
|
||||
|
||||
typedef typename std::conditional< rank == 0 , value_type ,
|
||||
typename std::conditional< rank == 1 , value_type * ,
|
||||
@ -543,66 +554,41 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
typedef
|
||||
Kokkos::Experimental::ViewTraits< data_type , array_layout
|
||||
, typename Traits::device_type
|
||||
, typename Traits::memory_traits > traits_type ;
|
||||
typedef Kokkos::Experimental::ViewTraits
|
||||
< data_type
|
||||
, array_layout
|
||||
, typename SrcTraits::device_type
|
||||
, typename SrcTraits::memory_traits > traits_type ;
|
||||
|
||||
typedef Kokkos::Experimental::View< data_type
|
||||
, array_layout
|
||||
, typename Traits::device_type
|
||||
, typename Traits::memory_traits > type ;
|
||||
typedef Kokkos::Experimental::View
|
||||
< data_type
|
||||
, array_layout
|
||||
, typename SrcTraits::device_type
|
||||
, typename SrcTraits::memory_traits > type ;
|
||||
|
||||
template< class T0 , class T1 , class T2 , class T3
|
||||
, class T4 , class T5 , class T6 , class T7 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void assign( ViewMapping< traits_type , void , void > & dst
|
||||
, ViewMapping< Traits , void , void > const & src
|
||||
, T0 const & arg0
|
||||
, T1 const & arg1
|
||||
, T2 const & arg2
|
||||
, T3 const & arg3
|
||||
, T4 const & arg4
|
||||
, T5 const & arg5
|
||||
, T6 const & arg6
|
||||
, T7 const & arg7
|
||||
)
|
||||
static void assign( ViewMapping< traits_type , void > & dst
|
||||
, ViewMapping< SrcTraits , void > const & src
|
||||
, Args ... args )
|
||||
{
|
||||
typedef ViewMapping< traits_type , void , void > DstType ;
|
||||
typedef ViewMapping< traits_type , void > DstType ;
|
||||
|
||||
typedef typename DstType::offset_type dst_offset_type ;
|
||||
typedef typename DstType::handle_type dst_handle_type ;
|
||||
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T0> V0 ;
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T1> V1 ;
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T2> V2 ;
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T3> V3 ;
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T4> V4 ;
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T5> V5 ;
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T6> V6 ;
|
||||
typedef Kokkos::Experimental::Impl::ViewOffsetRange<T7> V7 ;
|
||||
|
||||
dst.m_offset = dst_offset_type
|
||||
( src.m_offset
|
||||
, V0::dimension( src.m_offset.dimension_0() , arg0 )
|
||||
, V1::dimension( src.m_offset.dimension_1() , arg1 )
|
||||
, V2::dimension( src.m_offset.dimension_2() , arg2 )
|
||||
, V3::dimension( src.m_offset.dimension_3() , arg3 )
|
||||
, V4::dimension( src.m_offset.dimension_4() , arg4 )
|
||||
, V5::dimension( src.m_offset.dimension_5() , arg5 )
|
||||
, V6::dimension( src.m_offset.dimension_6() , arg6 )
|
||||
, V7::dimension( src.m_offset.dimension_7() , arg7 )
|
||||
);
|
||||
const SubviewExtents< SrcTraits::rank , rank >
|
||||
extents( src.m_offset.m_dim , args... );
|
||||
|
||||
dst.m_offset = dst_offset_type( src.m_offset , extents );
|
||||
dst.m_handle = dst_handle_type( src.m_handle +
|
||||
src.m_offset( V0::begin( arg0 )
|
||||
, V1::begin( arg1 )
|
||||
, V2::begin( arg2 )
|
||||
, V3::begin( arg3 )
|
||||
, V4::begin( arg4 )
|
||||
, V5::begin( arg5 )
|
||||
, V6::begin( arg6 )
|
||||
, V7::begin( arg7 )
|
||||
src.m_offset( extents.domain_offset(0)
|
||||
, extents.domain_offset(1)
|
||||
, extents.domain_offset(2)
|
||||
, extents.domain_offset(3)
|
||||
, extents.domain_offset(4)
|
||||
, extents.domain_offset(5)
|
||||
, extents.domain_offset(6)
|
||||
, extents.domain_offset(7)
|
||||
) );
|
||||
}
|
||||
};
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -69,8 +69,8 @@ struct ViewOffset< Dimension , Layout ,
|
||||
{
|
||||
public:
|
||||
|
||||
enum { SHIFT_0 = Kokkos::Impl::power_of_two<Layout::N0>::value };
|
||||
enum { SHIFT_1 = Kokkos::Impl::power_of_two<Layout::N1>::value };
|
||||
enum { SHIFT_0 = Kokkos::Impl::integral_power_of_two(Layout::N0) };
|
||||
enum { SHIFT_1 = Kokkos::Impl::integral_power_of_two(Layout::N1) };
|
||||
enum { SHIFT_T = SHIFT_0 + SHIFT_1 };
|
||||
enum { MASK_0 = Layout::N0 - 1 };
|
||||
enum { MASK_1 = Layout::N1 - 1 };
|
||||
@ -155,6 +155,42 @@ public:
|
||||
{}
|
||||
};
|
||||
|
||||
template< typename T , unsigned N0 , unsigned N1 , class ... P
|
||||
, typename iType0 , typename iType1
|
||||
>
|
||||
struct ViewMapping
|
||||
< void
|
||||
, Kokkos::Experimental::ViewTraits<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...>
|
||||
, Kokkos::LayoutTileLeft<N0,N1,true>
|
||||
, iType0
|
||||
, iType1 >
|
||||
{
|
||||
typedef Kokkos::LayoutTileLeft<N0,N1,true> src_layout ;
|
||||
typedef Kokkos::Experimental::ViewTraits< T** , src_layout , P... > src_traits ;
|
||||
typedef Kokkos::Experimental::ViewTraits< T[N0][N1] , LayoutLeft , P ... > traits ;
|
||||
typedef Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P ... > type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( ViewMapping< traits , void > & dst
|
||||
, const ViewMapping< src_traits , void > & src
|
||||
, const src_layout &
|
||||
, const size_t i_tile0
|
||||
, const size_t i_tile1
|
||||
)
|
||||
{
|
||||
typedef ViewMapping< traits , void > dst_map_type ;
|
||||
typedef ViewMapping< src_traits , void > src_map_type ;
|
||||
typedef typename dst_map_type::handle_type dst_handle_type ;
|
||||
typedef typename dst_map_type::offset_type dst_offset_type ;
|
||||
typedef typename src_map_type::offset_type src_offset_type ;
|
||||
|
||||
dst = dst_map_type(
|
||||
dst_handle_type( src.m_handle +
|
||||
( ( i_tile0 + src.m_offset.m_tile_N0 * i_tile1 ) << src_offset_type::SHIFT_T ) ) ,
|
||||
dst_offset_type() );
|
||||
}
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Experimental */
|
||||
} /* namespace Kokkos */
|
||||
@ -162,51 +198,20 @@ public:
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
// Using View with an invalid data type to construct the tiling subview.
|
||||
// View is a friend of View so we use this invalid data type partial specialization
|
||||
// to access implementation of both source and destination view for constructing
|
||||
// the tile subview.
|
||||
|
||||
template< unsigned N0 , unsigned N1 >
|
||||
struct View< void , Kokkos::LayoutTileLeft<N0,N1,true> , void , void >
|
||||
{
|
||||
typedef Kokkos::LayoutTileLeft<N0,N1,true> Layout ;
|
||||
|
||||
template< typename T , class A2 , class A3 >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , A2 , A3 >
|
||||
tile_subview( const Kokkos::Experimental::View<T**,Layout,A2,A3> & src
|
||||
, const size_t i_tile0
|
||||
, const size_t i_tile1
|
||||
)
|
||||
{
|
||||
typedef Kokkos::Experimental::View<T**,Layout,A2,A3> SrcView ;
|
||||
typedef Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , A2 , A3 > DstView ;
|
||||
|
||||
typedef typename SrcView::map_type::offset_type src_offset_type ;
|
||||
typedef typename DstView::map_type dst_map_type ;
|
||||
typedef typename DstView::map_type::handle_type dst_handle_type ;
|
||||
typedef typename DstView::map_type::offset_type dst_offset_type ;
|
||||
|
||||
return DstView( src.m_track ,
|
||||
dst_map_type(
|
||||
dst_handle_type( src.m_map.m_handle +
|
||||
( ( i_tile0 + src.m_map.m_offset.m_tile_N0 * i_tile1 ) << src_offset_type::SHIFT_T ) ) ,
|
||||
dst_offset_type() )
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
template< typename T , unsigned N0 , unsigned N1 , class A2 , class A3 >
|
||||
template< typename T , unsigned N0 , unsigned N1 , class ... P >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , A2 , A3 >
|
||||
tile_subview( const Kokkos::Experimental::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,A2,A3> & src
|
||||
Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P... >
|
||||
tile_subview( const Kokkos::Experimental::View<T**,Kokkos::LayoutTileLeft<N0,N1,true>,P...> & src
|
||||
, const size_t i_tile0
|
||||
, const size_t i_tile1
|
||||
)
|
||||
{
|
||||
return View< void , Kokkos::LayoutTileLeft<N0,N1,true> , void , void >::
|
||||
tile_subview( src , i_tile0 , i_tile1 );
|
||||
// Force the specialized ViewMapping for extracting a tile
|
||||
// by using the first subview argument as the layout.
|
||||
typedef Kokkos::LayoutTileLeft<N0,N1,true> SrcLayout ;
|
||||
|
||||
return Kokkos::Experimental::View< T[N0][N1] , LayoutLeft , P... >
|
||||
( src , SrcLayout() , i_tile0 , i_tile1 );
|
||||
}
|
||||
|
||||
} /* namespace Experimental */
|
||||
|
||||
@ -43,6 +43,8 @@
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
@ -842,3 +844,5 @@ void * create_singleton( size_t size
|
||||
|
||||
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
|
||||
@ -46,6 +46,8 @@
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
@ -351,7 +353,6 @@ public:
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// forward declaration for friend classes
|
||||
struct CopyWithoutTracking;
|
||||
struct MallocHelper;
|
||||
|
||||
/// class AllocationTracker
|
||||
@ -544,6 +545,10 @@ public:
|
||||
/// NOT thread-safe
|
||||
void reallocate( size_t size ) const;
|
||||
|
||||
static void disable_tracking();
|
||||
static void enable_tracking();
|
||||
static bool tracking_enabled();
|
||||
|
||||
private:
|
||||
|
||||
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
|
||||
@ -556,31 +561,14 @@ private:
|
||||
void increment_ref_count() const;
|
||||
void decrement_ref_count() const;
|
||||
|
||||
static void disable_tracking();
|
||||
static void enable_tracking();
|
||||
static bool tracking_enabled();
|
||||
|
||||
friend struct Impl::CopyWithoutTracking;
|
||||
friend struct Impl::MallocHelper;
|
||||
|
||||
uintptr_t m_alloc_rec;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/// Make a copy of the functor with reference counting disabled
|
||||
struct CopyWithoutTracking
|
||||
{
|
||||
template <typename Functor>
|
||||
static Functor apply( const Functor & f )
|
||||
{
|
||||
AllocationTracker::disable_tracking();
|
||||
Functor func(f);
|
||||
AllocationTracker::enable_tracking();
|
||||
return func;
|
||||
}
|
||||
};
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
#endif //KOKKOS_ALLOCATION_TRACKER_HPP
|
||||
|
||||
|
||||
@ -427,6 +427,8 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
|
||||
typedef int64_t type;
|
||||
};
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
|
||||
template<class ViewTraits>
|
||||
class ViewDataHandle<
|
||||
@ -457,6 +459,8 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif
|
||||
|
||||
@ -45,6 +45,7 @@
|
||||
#ifdef _WIN32
|
||||
|
||||
#define NOMINMAX
|
||||
#include <winsock2.h>
|
||||
#include <Windows.h>
|
||||
|
||||
namespace Kokkos {
|
||||
@ -61,7 +62,6 @@ namespace Kokkos {
|
||||
};
|
||||
}
|
||||
|
||||
#ifdef KOKKOS_HAVE_CXX11
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_compare_exchange(volatile T * const dest, const T & compare,
|
||||
@ -103,10 +103,18 @@ namespace Kokkos {
|
||||
KOKKOS_INLINE_FUNCTION U() {};
|
||||
} tmp, newval;
|
||||
newval.t = val;
|
||||
tmp.i = _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, *((LONGLONG*)&compare));
|
||||
_InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, ((LONGLONG*)&compare));
|
||||
tmp.t = dest;
|
||||
return tmp.t;
|
||||
}
|
||||
|
||||
template < typename T >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
T atomic_compare_exchange_strong(volatile T * const dest, const T & compare, const T & val)
|
||||
{
|
||||
return atomic_compare_exchange(dest,compare,val);
|
||||
}
|
||||
|
||||
template< typename T >
|
||||
T atomic_fetch_or(volatile T * const dest, const T val) {
|
||||
T oldval = *dest;
|
||||
@ -147,7 +155,20 @@ namespace Kokkos {
|
||||
}
|
||||
|
||||
/// Atomically replace *dest with (*dest - val) and return the value that
/// *dest held before the subtraction.
///
/// Implemented as a compare-exchange retry loop: the candidate result is
/// computed from the last observed value and published with
/// atomic_compare_exchange; the loop repeats until the value reported by the
/// exchange equals the value the computation was based on.
template< typename T >
T atomic_fetch_sub(volatile T * const dest, const T val) {
  T oldval = *dest;
  T assume;
  do {
    assume = oldval;
    // Subtract val FROM the current value (was 'val - oldval', which is the
    // reverse of a fetch-and-subtract).
    T newval = oldval - val;
    oldval = atomic_compare_exchange(dest, assume, newval);
  } while (assume != oldval);

  return oldval;
}
|
||||
|
||||
template< typename T >
|
||||
T atomic_exchange(volatile T * const dest, const T val) {
|
||||
T oldval = *dest;
|
||||
T assume;
|
||||
do {
|
||||
@ -174,8 +195,8 @@ namespace Kokkos {
|
||||
}
|
||||
|
||||
/// Subtract 'val' from *dest via atomic_fetch_sub, discarding the value
/// that atomic_fetch_sub returns.
template< typename T >
void atomic_sub( volatile T * const dest , const T val )
{
  (void) atomic_fetch_sub( dest , val );
}
|
||||
|
||||
template< typename T >
|
||||
@ -208,4 +229,4 @@ namespace Kokkos {
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@ -43,6 +43,8 @@
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
#include <impl/Kokkos_BasicAllocators.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
@ -50,8 +52,11 @@
|
||||
#include <stdint.h> // uintptr_t
|
||||
#include <cstdlib> // for malloc, realloc, and free
|
||||
#include <cstring> // for memcpy
|
||||
|
||||
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
|
||||
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
|
||||
#endif
|
||||
|
||||
#include <sstream>
|
||||
|
||||
@ -103,8 +108,7 @@ void * raw_aligned_allocate( size_t size, size_t alignment )
|
||||
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
|
||||
ptr = _mm_malloc( size , alignment );
|
||||
|
||||
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
|
||||
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
|
||||
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
|
||||
posix_memalign( & ptr, alignment , size );
|
||||
|
||||
@ -136,8 +140,7 @@ void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
|
||||
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
|
||||
_mm_free( ptr );
|
||||
|
||||
#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
|
||||
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
|
||||
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
free( ptr );
|
||||
#else
|
||||
// get the alloc'd pointer
|
||||
@ -279,3 +282,6 @@ void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t
|
||||
}
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
|
||||
@ -44,6 +44,7 @@
|
||||
#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
|
||||
#define KOKKOS_BASIC_ALLOCATORS_HPP
|
||||
|
||||
#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
|
||||
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
@ -113,6 +114,8 @@ public:
|
||||
|
||||
}} // namespace Kokkos::Impl
|
||||
|
||||
#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
|
||||
|
||||
#endif //KOKKOS_BASIC_ALLOCATORS_HPP
|
||||
|
||||
|
||||
|
||||
@ -67,6 +67,13 @@ bool is_unsigned_int(const char* str)
|
||||
|
||||
void initialize_internal(const InitArguments& args)
|
||||
{
|
||||
// This is an experimental setting
|
||||
// For KNL in Flat mode this variable should be set, so that
|
||||
// memkind allocates high bandwidth memory correctly.
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
#endif
|
||||
|
||||
// Protect declarations, to prevent "unused variable" warnings.
|
||||
#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
|
||||
const int num_threads = args.num_threads;
|
||||
|
||||
@ -61,7 +61,7 @@ void host_abort( const char * const message )
|
||||
{
|
||||
fwrite(message,1,strlen(message),stderr);
|
||||
fflush(stderr);
|
||||
abort();
|
||||
::abort();
|
||||
}
|
||||
|
||||
void throw_runtime_exception( const std::string & msg )
|
||||
|
||||
@ -46,6 +46,10 @@
|
||||
|
||||
#include <string>
|
||||
#include <iosfwd>
|
||||
#include <KokkosCore_config.h>
|
||||
#ifdef KOKKOS_HAVE_CUDA
|
||||
#include <Cuda/Kokkos_Cuda_abort.hpp>
|
||||
#endif
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
108
lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp
Normal file
108
lib/kokkos/core/src/impl/Kokkos_HBWAllocators.cpp
Normal file
@ -0,0 +1,108 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
#include <impl/Kokkos_HBWAllocators.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
|
||||
#include <stdint.h> // uintptr_t
|
||||
#include <cstdlib> // for malloc, realloc, and free
|
||||
#include <cstring> // for memcpy
|
||||
|
||||
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
|
||||
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
|
||||
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
|
||||
#endif
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
#include <memkind.h>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
#define MEMKIND_TYPE MEMKIND_HBW //hbw_get_kind(HBW_PAGESIZE_4KB)
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
void* HBWMallocAllocator::allocate( size_t size )
|
||||
{
|
||||
std::cout<< "Allocate HBW: " << 1.0e-6*size << "MB" << std::endl;
|
||||
void * ptr = NULL;
|
||||
if (size) {
|
||||
ptr = memkind_malloc(MEMKIND_TYPE,size);
|
||||
|
||||
if (!ptr)
|
||||
{
|
||||
std::ostringstream msg ;
|
||||
msg << name() << ": allocate(" << size << ") FAILED";
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void HBWMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
|
||||
{
|
||||
if (ptr) {
|
||||
memkind_free(MEMKIND_TYPE,ptr);
|
||||
}
|
||||
}
|
||||
|
||||
void * HBWMallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
|
||||
{
|
||||
void * ptr = memkind_realloc(MEMKIND_TYPE, old_ptr, new_size);
|
||||
|
||||
if (new_size > 0u && ptr == NULL) {
|
||||
Kokkos::Impl::throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
#endif
|
||||
75
lib/kokkos/core/src/impl/Kokkos_HBWAllocators.hpp
Normal file
75
lib/kokkos/core/src/impl/Kokkos_HBWAllocators.hpp
Normal file
@ -0,0 +1,75 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_HBW_ALLOCATORS_HPP
|
||||
#define KOKKOS_HBW_ALLOCATORS_HPP
|
||||
|
||||
#ifdef KOKKOS_HAVE_HBWSPACE
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
/// class HBWMallocAllocator
///
/// Static allocator interface; the member functions are defined in
/// impl/Kokkos_HBWAllocators.cpp.
class HBWMallocAllocator
{
public:

  /// Human-readable allocator name.
  static const char * name() { return "HBW Malloc Allocator"; }

  /// Allocate 'size' bytes.
  static void * allocate( size_t size );

  /// Release memory previously obtained from this allocator.
  static void deallocate( void * ptr , size_t size );

  /// Resize an allocation from 'old_size' to 'new_size' bytes.
  static void * reallocate( void * old_ptr , size_t old_size , size_t new_size );
};
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace Kokkos
|
||||
#endif //KOKKOS_HAVE_HBWSPACE
|
||||
#endif //KOKKOS_HBW_ALLOCATORS_HPP
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user