Updating Kokkos lib to 2.03.00

This commit is contained in:
Stan Moore
2017-04-25 13:48:51 -06:00
parent 9f6e126a2f
commit 8910ec6e59
261 changed files with 27816 additions and 17799 deletions

View File

@ -1,5 +1,28 @@
# Change Log
## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00)
**Implemented enhancements:**
- UnorderedMap: make it accept Devices or MemorySpaces \(usage sketch after this list\) [\#711](https://github.com/kokkos/kokkos/issues/711)
- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691)
- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675)
- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666)
- Turning off IVDEP for Intel 14. [\#638](https://github.com/kokkos/kokkos/issues/638)
- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633)
- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632)
- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547)
- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289)
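A minimal sketch of the relaxed `UnorderedMap` template parameter from \#711 above (the key/value types and capacity here are illustrative assumptions, not taken from this commit):

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_UnorderedMap.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // The third template parameter may now name either a device/execution
    // space or a memory space.
    Kokkos::UnorderedMap<int, double, Kokkos::DefaultHostExecutionSpace> map_dev(128);
    Kokkos::UnorderedMap<int, double, Kokkos::HostSpace>                 map_mem(128);
    map_mem.insert(42, 3.14);  // host-accessible map, so insert directly
  }
  Kokkos::finalize();
  return 0;
}
```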
**Fixed bugs:**
- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733)
- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688)
- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664)
- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655)
- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644)
## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15)

View File

@ -98,10 +98,10 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
)
TRIBITS_ADD_OPTION_AND_DEFINE(
Kokkos_ENABLE_QTHREAD
KOKKOS_HAVE_QTHREAD
"Enable QTHREAD support in Kokkos."
"${TPL_ENABLE_QTHREAD}"
Kokkos_ENABLE_Qthreads
KOKKOS_HAVE_QTHREADS
"Enable Qthreads support in Kokkos."
"${TPL_ENABLE_QTHREADS}"
)
TRIBITS_ADD_OPTION_AND_DEFINE(
@ -213,4 +213,3 @@ TRIBITS_EXCLUDE_FILES(
)
TRIBITS_PACKAGE_POSTPROCESS()

View File

@ -1,39 +1,38 @@
# Default settings common options
# Default settings: common options.
#LAMMPS specific settings:
KOKKOS_PATH=../../lib/kokkos
CXXFLAGS=$(CCFLAGS)
#Options: OpenMP,Serial,Pthreads,Cuda
# Options: Cuda,OpenMP,Pthreads,Qthreads,Serial
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
# Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
KOKKOS_ARCH ?= ""
#Options: yes,no
# Options: yes,no
KOKKOS_DEBUG ?= "no"
#Options: hwloc,librt,experimental_memkind
# Options: hwloc,librt,experimental_memkind
KOKKOS_USE_TPLS ?= ""
#Options: c++11,c++1z
# Options: c++11,c++1z
KOKKOS_CXX_STANDARD ?= "c++11"
#Options: aggressive_vectorization,disable_profiling
# Options: aggressive_vectorization,disable_profiling
KOKKOS_OPTIONS ?= ""
#Default settings specific options
#Options: force_uvm,use_ldg,rdc,enable_lambda
# Default settings: specific options.
# Options: force_uvm,use_ldg,rdc,enable_lambda
KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
# Check for general settings
# Check for general settings.
KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l))
# Check for external libraries
# Check for external libraries.
KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
# Check for advanced settings
# Check for advanced settings.
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
@ -41,21 +40,21 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | gr
KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
# Check for Kokkos Host Execution Spaces one of which must be on
# Check for Kokkos Host Execution Spaces, one of which must be on.
KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
KOKKOS_INTERNAL_USE_QTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthreads | wc -l))
KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
KOKKOS_INTERNAL_USE_QTHREAD := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthread | wc -l))
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
KOKKOS_INTERNAL_USE_SERIAL := 1
endif
endif
endif
# Check for other Execution Spaces
# Check for other Execution Spaces.
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
@ -64,15 +63,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
endif
# Check OS
# Check OS.
KOKKOS_OS := $(shell uname -s)
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname -s | grep CYGWIN | wc -l)
KOKKOS_INTERNAL_OS_LINUX := $(shell uname -s | grep Linux | wc -l)
KOKKOS_INTERNAL_OS_DARWIN := $(shell uname -s | grep Darwin | wc -l)
# Check compiler
# Check compiler.
KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
@ -95,6 +92,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
$(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
@ -103,7 +101,6 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
else
@ -114,7 +111,7 @@ else
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# OpenMP is turned on by default in Cray compiler environment
# OpenMP is turned on by default in the Cray compiler environment.
KOKKOS_INTERNAL_OPENMP_FLAG :=
else
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
@ -138,9 +135,9 @@ else
endif
endif
# Check for Kokkos Architecture settings
# Check for Kokkos Architecture settings.
#Intel based
# Intel based.
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
@ -148,7 +145,7 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW |
KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
#NVIDIA based
# NVIDIA based.
NVCC_WRAPPER := $(KOKKOS_PATH)/config/nvcc_wrapper
KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
@ -170,9 +167,9 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
@ -183,33 +180,33 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
endif
#ARM based
# ARM based.
KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l))
#IBM based
# IBM based.
KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power9 | wc -l))
KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))
#AMD based
# AMD based.
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
#Any AVX?
# Any AVX?
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
# Decide what ISA level we are able to support
# Decide what ISA level we are able to support.
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
#Incompatible flags?
# Incompatible flags?
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc ))
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
@ -220,7 +217,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
$(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
endif
#Generating the list of Flags
# Generating the list of Flags.
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
@ -236,15 +233,19 @@ KOKKOS_LDFLAGS = -L$(shell pwd)
KOKKOS_SRC =
KOKKOS_HEADERS =
#Generating the KokkosCore_config.h file
# Generating the KokkosCore_config.h file.
tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
tmp := $(shell date >> KokkosCore_config.tmp)
tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp)
endif
@ -253,12 +254,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREADS 1" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
endif
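For reference, a hypothetical excerpt of the generated KokkosCore_config.h for an OpenMP+Serial build, assembled from the echo lines above (the date line is elided):

```c++
/* ---------------------------------------------
Makefile constructed configuration:
----------------------------------------------*/
/* Execution Spaces */
#define KOKKOS_HAVE_OPENMP 1
#define KOKKOS_HAVE_SERIAL 1
```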
ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
@ -279,12 +280,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
KOKKOS_CPPFLAGS += -I$(QTHREAD_PATH)/include
KOKKOS_LDFLAGS += -L$(QTHREAD_PATH)/lib
tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREAD 1" >> KokkosCore_config.tmp )
endif
tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
@ -341,6 +336,7 @@ endif
tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
endif
@ -365,16 +361,19 @@ ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
$(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
endif
endif
endif
#Add Architecture flags
# Add Architecture flags.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
@ -391,6 +390,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
@ -408,6 +408,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_CXXFLAGS +=
KOKKOS_LDFLAGS +=
@ -424,6 +425,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -mavx
KOKKOS_LDFLAGS += -mavx
@ -435,7 +437,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
KOKKOS_CXXFLAGS += -tp=sandybridge
KOKKOS_LDFLAGS += -tp=sandybridge
else
# Assume that this is really a GNU compiler
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -mavx
KOKKOS_LDFLAGS += -mavx
endif
@ -445,10 +447,11 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler, or it could be XL on P8
# Assume that this is really a GNU compiler, or it could be XL on P8.
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
endif
@ -456,10 +459,11 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler, or it could be XL on P9
# Assume that this is really a GNU compiler, or it could be XL on P9.
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
endif
@ -467,6 +471,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xCORE-AVX2
KOKKOS_LDFLAGS += -xCORE-AVX2
@ -478,7 +483,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
KOKKOS_CXXFLAGS += -tp=haswell
KOKKOS_LDFLAGS += -tp=haswell
else
# Assume that this is really a GNU compiler
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
endif
@ -488,6 +493,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xMIC-AVX512
KOKKOS_LDFLAGS += -xMIC-AVX512
@ -498,7 +504,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Assume that this is really a GNU compiler
# Assume that this is really a GNU compiler.
KOKKOS_CXXFLAGS += -march=knl
KOKKOS_LDFLAGS += -march=knl
endif
@ -508,6 +514,7 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
KOKKOS_CXXFLAGS += -xCORE-AVX512
KOKKOS_LDFLAGS += -xCORE-AVX512
@ -518,7 +525,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
else
# Nothing here yet
# Nothing here yet.
KOKKOS_CXXFLAGS += -march=skylake-avx512
KOKKOS_LDFLAGS += -march=skylake-avx512
endif
@ -532,67 +539,79 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
KOKKOS_LDFLAGS += -mmic
endif
#Figure out the architecture flag for Cuda
# Figure out the architecture flag for Cuda.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=--cuda-gpu-arch
KOKKOS_CXXFLAGS += -x cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
endif
endif
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
else
KOKKOS_INTERNAL_NEW_CONFIG := 1
KOKKOS_INTERNAL_NEW_CONFIG := 1
endif
ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
@ -616,30 +635,34 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_LIBS += -lcudart -lcuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
KOKKOS_LIBS += -lpthread
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
KOKKOS_LIBS += -lqthread
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
else
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
endif
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
endif
#Explicitly set the GCC Toolchain for Clang
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
KOKKOS_LIBS += -lpthread
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
KOKKOS_LIBS += -lqthread
endif
# Explicitly set the GCC Toolchain for Clang.
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
@ -647,15 +670,15 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
endif
#With Cygwin functions such as fdopen and fileno are not defined
#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
#though. So we hard undefine it here. Not sure if that has any bad side effects
#This is needed for gtest actually, not for Kokkos itself!
# With Cygwin, functions such as fdopen and fileno are not defined
# when strict ANSI is enabled, and strict ANSI gets enabled with --std=c++11.
# So we hard undefine it here; not sure if that has any bad side effects.
# This is needed for gtest, actually, not for Kokkos itself!
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
endif
# Setting up dependencies
# Setting up dependencies.
KokkosCore_config.h:

View File

@ -18,6 +18,8 @@ Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
@ -43,11 +45,11 @@ Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokk
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
Kokkos_QthreadExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
Kokkos_Qthread_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
Kokkos_QthreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
Kokkos_Qthreads_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -59,4 +61,3 @@ endif
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp

View File

@ -45,31 +45,39 @@ Primary tested compilers on X86 are:
GCC 4.8.4
GCC 4.9.2
GCC 5.1.0
GCC 5.2.0
Intel 14.0.4
Intel 15.0.2
Intel 16.0.1
Intel 17.0.098
Intel 17.1.132
Clang 3.5.2
Clang 3.6.1
Clang 3.7.1
Clang 3.8.1
Clang 3.9.0
PGI 17.1
Primary tested compilers on Power 8 are:
GCC 5.4.0 (OpenMP,Serial)
IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug)
Primary tested compilers on Intel KNL are:
GCC 6.2.0
Intel 16.2.181 (with gcc 4.7.2)
Intel 17.0.098 (with gcc 4.7.2)
Intel 17.1.132 (with gcc 4.9.3)
Intel 17.2.174 (with gcc 4.9.3)
Intel 18.0.061 (beta) (with gcc 4.9.3)
Secondary tested compilers are:
CUDA 7.0 (with gcc 4.7.2)
CUDA 7.5 (with gcc 4.7.2)
CUDA 7.0 (with gcc 4.8.4)
CUDA 7.5 (with gcc 4.8.4)
CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8)
CUDA/Clang 8.0 using Clang/Trunk compiler
Other compilers working:
X86:
PGI 15.4
Cygwin 2.1.0 64bit with gcc 4.9.3
Known non-working combinations:

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -547,7 +547,7 @@ namespace Kokkos {
KOKKOS_INLINE_FUNCTION
Random_XorShift64 (uint64_t state, int state_idx = 0)
: state_(state),state_idx_(state_idx){}
: state_(state==0?uint64_t(1318319):state),state_idx_(state_idx){}
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
@ -719,6 +719,9 @@ namespace Kokkos {
}
void init(uint64_t seed, int num_states) {
if(seed==0)
seed = uint64_t(1318319);
num_states_ = num_states;
locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
@ -968,8 +971,9 @@ namespace Kokkos {
inline
void init(uint64_t seed, int num_states) {
if(seed==0)
seed = uint64_t(1318319);
num_states_ = num_states;
locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
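The hunks above guard against a degenerate all-zero XorShift state (issue #655: normal() hangs for small seed values). A minimal host-side sketch of the now-safe behavior; the pool type and usage are illustrative:

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // A seed of 0 is remapped internally to 1318319, so the generator
    // state can never be all zero and normal() no longer hangs.
    Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(0);
    auto gen = pool.get_state();
    double x = gen.normal();
    pool.free_state(gen);
    (void)x;
  }
  Kokkos::finalize();
  return 0;
}
```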

View File

@ -53,69 +53,122 @@ namespace Kokkos {
namespace Impl {
template<class ValuesViewType, int Rank=ValuesViewType::Rank>
template< class DstViewType , class SrcViewType
, int Rank = DstViewType::Rank >
struct CopyOp;
template<class ValuesViewType>
struct CopyOp<ValuesViewType,1> {
template<class DstType, class SrcType>
template< class DstViewType , class SrcViewType >
struct CopyOp<DstViewType,SrcViewType,1> {
KOKKOS_INLINE_FUNCTION
static void copy(DstType& dst, size_t i_dst,
SrcType& src, size_t i_src ) {
static void copy(DstViewType const& dst, size_t i_dst,
SrcViewType const& src, size_t i_src ) {
dst(i_dst) = src(i_src);
}
};
template<class ValuesViewType>
struct CopyOp<ValuesViewType,2> {
template<class DstType, class SrcType>
template< class DstViewType , class SrcViewType >
struct CopyOp<DstViewType,SrcViewType,2> {
KOKKOS_INLINE_FUNCTION
static void copy(DstType& dst, size_t i_dst,
SrcType& src, size_t i_src ) {
for(int j = 0;j< (int) dst.dimension_1(); j++)
static void copy(DstViewType const& dst, size_t i_dst,
SrcViewType const& src, size_t i_src ) {
for(int j = 0;j< (int) dst.extent(1); j++)
dst(i_dst,j) = src(i_src,j);
}
};
template<class ValuesViewType>
struct CopyOp<ValuesViewType,3> {
template<class DstType, class SrcType>
template< class DstViewType , class SrcViewType >
struct CopyOp<DstViewType,SrcViewType,3> {
KOKKOS_INLINE_FUNCTION
static void copy(DstType& dst, size_t i_dst,
SrcType& src, size_t i_src ) {
for(int j = 0; j<dst.dimension_1(); j++)
for(int k = 0; k<dst.dimension_2(); k++)
static void copy(DstViewType const& dst, size_t i_dst,
SrcViewType const& src, size_t i_src ) {
for(int j = 0; j<dst.extent(1); j++)
for(int k = 0; k<dst.extent(2); k++)
dst(i_dst,j,k) = src(i_src,j,k);
}
};
}
template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space,
class SizeType = typename KeyViewType::memory_space::size_type>
//----------------------------------------------------------------------------
template< class KeyViewType
, class BinSortOp
, class Space = typename KeyViewType::device_type
, class SizeType = typename KeyViewType::memory_space::size_type
>
class BinSort {
public:
template<class ValuesViewType, class PermuteViewType, class CopyOp>
struct bin_sort_sort_functor {
typedef ExecutionSpace execution_space;
typedef typename ValuesViewType::non_const_type values_view_type;
typedef typename ValuesViewType::const_type const_values_view_type;
Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout,
typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values;
values_view_type sorted_values;
typename PermuteViewType::const_type sort_order;
bin_sort_sort_functor(const_values_view_type values_, values_view_type sorted_values_, PermuteViewType sort_order_):
values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {}
template< class DstViewType , class SrcViewType >
struct copy_functor {
typedef typename SrcViewType::const_type src_view_type ;
typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
DstViewType dst_values ;
src_view_type src_values ;
int dst_offset ;
copy_functor( DstViewType const & dst_values_
, int const & dst_offset_
, SrcViewType const & src_values_
)
: dst_values( dst_values_ )
, src_values( src_values_ )
, dst_offset( dst_offset_ )
{}
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
//printf("Sort: %i %i\n",i,sort_order(i));
CopyOp::copy(sorted_values,i,values,sort_order(i));
// printf("copy: dst(%i) src(%i)\n",i+dst_offset,i);
copy_op::copy(dst_values,i+dst_offset,src_values,i);
}
};
typedef ExecutionSpace execution_space;
template< class DstViewType
, class PermuteViewType
, class SrcViewType
>
struct copy_permute_functor {
// If a Kokkos::View, then we can generate constant random access;
// otherwise we can only use the constant type.
typedef typename std::conditional
< Kokkos::is_view< SrcViewType >::value
, Kokkos::View< typename SrcViewType::const_data_type
, typename SrcViewType::array_layout
, typename SrcViewType::device_type
, Kokkos::MemoryTraits<Kokkos::RandomAccess>
>
, typename SrcViewType::const_type
>::type src_view_type ;
typedef typename PermuteViewType::const_type perm_view_type ;
typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
DstViewType dst_values ;
perm_view_type sort_order ;
src_view_type src_values ;
copy_permute_functor( DstViewType const & dst_values_
, PermuteViewType const & sort_order_
, SrcViewType const & src_values_
)
: dst_values( dst_values_ )
, sort_order( sort_order_ )
, src_values( src_values_ )
{}
KOKKOS_INLINE_FUNCTION
void operator() (const int& i) const {
// printf("copy_permute: dst(%i) src(%i)\n",i,sort_order(i));
copy_op::copy(dst_values,i,src_values,sort_order(i));
}
};
typedef typename Space::execution_space execution_space;
typedef BinSortOp bin_op_type;
struct bin_count_tag {};
@ -124,84 +177,137 @@ public:
struct bin_sort_bins_tag {};
public:
typedef SizeType size_type;
typedef size_type value_type;
typedef Kokkos::View<size_type*, execution_space> offset_type;
typedef Kokkos::View<const int*, execution_space> bin_count_type;
typedef Kokkos::View<size_type*, Space> offset_type;
typedef Kokkos::View<const int*, Space> bin_count_type;
typedef typename KeyViewType::const_type const_key_view_type ;
typedef Kokkos::View<typename KeyViewType::const_data_type,
// If a Kokkos::View, then we can generate constant random access;
// otherwise we can only use the constant type.
typedef typename std::conditional
< Kokkos::is_view< KeyViewType >::value
, Kokkos::View< typename KeyViewType::const_data_type,
typename KeyViewType::array_layout,
typename KeyViewType::memory_space> const_key_view_type;
typedef Kokkos::View<typename KeyViewType::const_data_type,
typename KeyViewType::array_layout,
typename KeyViewType::memory_space,
Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type;
typename KeyViewType::device_type,
Kokkos::MemoryTraits<Kokkos::RandomAccess> >
, const_key_view_type
>::type const_rnd_key_view_type;
typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
typedef typename KeyViewType::const_value_type const_key_scalar;
typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic_type ;
private:
const_key_view_type keys;
const_rnd_key_view_type keys_rnd;
public:
BinSortOp bin_op;
offset_type bin_offsets;
BinSortOp bin_op ;
offset_type bin_offsets ;
bin_count_atomic_type bin_count_atomic ;
bin_count_type bin_count_const ;
offset_type sort_order ;
Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic;
bin_count_type bin_count_const;
offset_type sort_order;
bool sort_within_bins;
int range_begin ;
int range_end ;
bool sort_within_bins ;
public:
// Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
BinSort(const_key_view_type keys_, BinSortOp bin_op_,
bool sort_within_bins_ = false)
:keys(keys_),keys_rnd(keys_), bin_op(bin_op_) {
BinSort() {}
bin_count_atomic = Kokkos::View<int*, ExecutionSpace >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
//----------------------------------------
// Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
BinSort( const_key_view_type keys_
, int range_begin_
, int range_end_
, BinSortOp bin_op_
, bool sort_within_bins_ = false
)
: keys(keys_)
, keys_rnd(keys_)
, bin_op(bin_op_)
, bin_offsets()
, bin_count_atomic()
, bin_count_const()
, sort_order()
, range_begin( range_begin_ )
, range_end( range_end_ )
, sort_within_bins( sort_within_bins_ )
{
bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
bin_count_const = bin_count_atomic;
bin_offsets = offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
sort_order = offset_type("PermutationVector",keys.dimension_0());
sort_within_bins = sort_within_bins_;
sort_order = offset_type("PermutationVector",range_end-range_begin);
}
BinSort( const_key_view_type keys_
, BinSortOp bin_op_
, bool sort_within_bins_ = false
)
: BinSort( keys_ , 0 , keys_.extent(0), bin_op_ , sort_within_bins_ ) {}
//----------------------------------------
// Create the permutation vector, the bin_offset array and the bin_count array. Can be called again if the keys change
void create_permute_vector() {
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag> (0,keys.dimension_0()),*this);
Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag> (0,bin_op.max_bins()) ,*this);
const size_t len = range_end - range_begin ;
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_count_tag> (0,len),*this);
Kokkos::parallel_scan(Kokkos::RangePolicy<execution_space,bin_offset_tag> (0,bin_op.max_bins()) ,*this);
Kokkos::deep_copy(bin_count_atomic,0);
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag> (0,keys.dimension_0()),*this);
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_binning_tag> (0,len),*this);
if(sort_within_bins)
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
}
// Sort a view with respect to the first dimension using the permutation array
template<class ValuesViewType>
void sort(ValuesViewType values) {
ValuesViewType sorted_values = ValuesViewType("Copy",
values.dimension_0(),
values.dimension_1(),
values.dimension_2(),
values.dimension_3(),
values.dimension_4(),
values.dimension_5(),
values.dimension_6(),
values.dimension_7());
void sort( ValuesViewType const & values)
{
typedef
Kokkos::View< typename ValuesViewType::data_type,
typename ValuesViewType::array_layout,
typename ValuesViewType::device_type >
scratch_view_type ;
parallel_for(values.dimension_0(),
bin_sort_sort_functor<ValuesViewType, offset_type,
Impl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
const size_t len = range_end - range_begin ;
deep_copy(values,sorted_values);
scratch_view_type
sorted_values("Scratch",
len,
values.extent(1),
values.extent(2),
values.extent(3),
values.extent(4),
values.extent(5),
values.extent(6),
values.extent(7));
{
copy_permute_functor< scratch_view_type /* DstViewType */
, offset_type /* PermuteViewType */
, ValuesViewType /* SrcViewType */
>
functor( sorted_values , sort_order , values );
parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
}
{
copy_functor< ValuesViewType , scratch_view_type >
functor( values , range_begin , sorted_values );
parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
}
}
// Get the permutation vector
@ -217,9 +323,11 @@ public:
bin_count_type get_bin_count() const {return bin_count_const;}
public:
KOKKOS_INLINE_FUNCTION
void operator() (const bin_count_tag& tag, const int& i) const {
bin_count_atomic(bin_op.bin(keys,i))++;
const int j = range_begin + i ;
bin_count_atomic(bin_op.bin(keys,j))++;
}
KOKKOS_INLINE_FUNCTION
@ -232,10 +340,11 @@ public:
KOKKOS_INLINE_FUNCTION
void operator() (const bin_binning_tag& tag, const int& i) const {
const int bin = bin_op.bin(keys,i);
const int j = range_begin + i ;
const int bin = bin_op.bin(keys,j);
const int count = bin_count_atomic(bin)++;
sort_order(bin_offsets(bin) + count) = i;
sort_order(bin_offsets(bin) + count) = j ;
}
KOKKOS_INLINE_FUNCTION
@ -262,13 +371,19 @@ public:
}
};
//----------------------------------------------------------------------------
template<class KeyViewType>
struct BinOp1D {
const int max_bins_;
const double mul_;
int max_bins_;
double mul_;
typename KeyViewType::const_value_type range_;
typename KeyViewType::const_value_type min_;
BinOp1D():max_bins_(0),mul_(0.0),
range_(typename KeyViewType::const_value_type()),
min_(typename KeyViewType::const_value_type()) {}
//Construct BinOp with number of bins, minimum value and maximum value
BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
typename KeyViewType::const_value_type max )
@ -302,12 +417,14 @@ struct BinOp3D {
typename KeyViewType::non_const_value_type range_[3];
typename KeyViewType::non_const_value_type min_[3];
BinOp3D() {}
BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
typename KeyViewType::const_value_type max[] )
{
max_bins_[0] = max_bins__[0]+1;
max_bins_[1] = max_bins__[1]+1;
max_bins_[2] = max_bins__[2]+1;
max_bins_[0] = max_bins__[0];
max_bins_[1] = max_bins__[1];
max_bins_[2] = max_bins__[2];
mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]);
mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]);
mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]);
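Putting the pieces together, a hedged sketch of driving BinSort by hand with a 1-D binning operator (the key range and bin count are assumptions for illustration):

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    typedef Kokkos::View<float*> KeyView;
    typedef Kokkos::BinOp1D<KeyView> CompType;

    KeyView keys("keys", 1000);
    // ... fill keys with values in [0, 100) ...

    // 64 bins over [0, 100); also sort keys within each bin.
    Kokkos::BinSort<KeyView, CompType> bin_sort(keys, CompType(64, 0.0f, 100.0f), true);
    bin_sort.create_permute_vector();
    bin_sort.sort(keys);
  }
  Kokkos::finalize();
  return 0;
}
```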
@ -364,7 +481,7 @@ bool try_std_sort(ViewType view) {
possible = possible && (ViewType::Rank == 1);
possible = possible && (stride[0] == 1);
if(possible) {
std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
std::sort(view.data(),view.data()+view.extent(0));
}
return possible;
}
@ -386,7 +503,8 @@ struct min_max_functor {
}
template<class ViewType>
void sort(ViewType view, bool always_use_kokkos_sort = false) {
void sort( ViewType const & view , bool const always_use_kokkos_sort = false)
{
if(!always_use_kokkos_sort) {
if(Impl::try_std_sort(view)) return;
}
@ -394,14 +512,37 @@ void sort(ViewType view, bool always_use_kokkos_sort = false) {
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.dimension_0()),
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.extent(0)),
Impl::min_max_functor<ViewType>(view),reducer);
if(result.min_val == result.max_val) return;
BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true);
BinSort<ViewType, CompType> bin_sort(view,CompType(view.extent(0)/2,result.min_val,result.max_val),true);
bin_sort.create_permute_vector();
bin_sort.sort(view);
}
template<class ViewType>
void sort( ViewType view
, size_t const begin
, size_t const end
)
{
typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy ;
typedef BinOp1D<ViewType> CompType;
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
parallel_reduce( range_policy( begin , end )
, Impl::min_max_functor<ViewType>(view),reducer );
if(result.min_val == result.max_val) return;
BinSort<ViewType, CompType>
bin_sort(view,begin,end,CompType((end-begin)/2,result.min_val,result.max_val),true);
bin_sort.create_permute_vector();
bin_sort.sort(view);
}
}
#endif
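A short usage sketch of the new subrange overload added above (view size and bounds are illustrative):

```c++
#include <Kokkos_Core.hpp>
#include <Kokkos_Sort.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<int*> v("v", 100);
    // ... fill v ...

    Kokkos::sort(v, 10, 90);  // new: sort only the half-open range [10, 90)
    Kokkos::sort(v);          // existing: sort the whole view
  }
  Kokkos::finalize();
  return 0;
}
```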

View File

@ -44,6 +44,7 @@
#include <gtest/gtest.h>
#include<Kokkos_Core.hpp>
#include<Kokkos_DynamicView.hpp>
#include<Kokkos_Random.hpp>
#include<Kokkos_Sort.hpp>
@ -192,17 +193,81 @@ void test_3D_sort(unsigned int n) {
double epsilon = 1e-10;
unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
if ( sort_fails )
printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
ASSERT_EQ(sort_fails,0);
ASSERT_EQ(equal_sum,1);
}
//----------------------------------------------------------------------------
template<class ExecutionSpace, typename KeyType>
void test_dynamic_view_sort(unsigned int n )
{
typedef typename ExecutionSpace::memory_space memory_space ;
typedef Kokkos::Experimental::DynamicView<KeyType*,ExecutionSpace> KeyDynamicViewType;
typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
const size_t upper_bound = 2 * n ;
typename KeyDynamicViewType::memory_pool
pool( memory_space() , 2 * n * sizeof(KeyType) );
KeyDynamicViewType keys("Keys",pool,upper_bound);
keys.resize_serial(n);
KeyViewType keys_view("KeysTmp", n );
// Test sorting array with all numbers equal
Kokkos::deep_copy(keys_view,KeyType(1));
Kokkos::Experimental::deep_copy(keys,keys_view);
Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
Kokkos::Experimental::deep_copy(keys,keys_view);
double sum_before = 0.0;
double sum_after = 0.0;
unsigned int sort_fails = 0;
Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_before);
Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
Kokkos::Experimental::deep_copy( keys_view , keys );
Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
double ratio = sum_before/sum_after;
double epsilon = 1e-10;
unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
if ( sort_fails != 0 || equal_sum != 1 ) {
std::cout << " N = " << n
<< " ; sum_before = " << sum_before
<< " ; sum_after = " << sum_after
<< " ; ratio = " << ratio
<< std::endl ;
}
ASSERT_EQ(sort_fails,0);
ASSERT_EQ(equal_sum,1);
}
//----------------------------------------------------------------------------
template<class ExecutionSpace, typename KeyType>
void test_sort(unsigned int N)
{
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
test_3D_sort<ExecutionSpace,KeyType>(N);
test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
}
}

View File

@ -140,6 +140,9 @@ do
#strip off -pedantic because it produces endless warnings about #LINE added by the preprocessor
-pedantic|-Wpedantic|-ansi)
;;
#strip off -Woverloaded-virtual to avoid "cc1: warning: command line option -Woverloaded-virtual is valid for C++/ObjC++ but not for C"
-Woverloaded-virtual)
;;
#strip -Xcompiler because we add it
-Xcompiler)
if [ $first_xcompiler_arg -eq 1 ]; then
@ -190,7 +193,7 @@ do
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.dylib)
@*|*.dylib)
object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1"
;;

View File

@ -63,8 +63,7 @@
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -63,8 +63,7 @@
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -6,7 +6,7 @@
#-----------------------------------------------------------------------------
# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
#
# Cuda, OpenMP, Threads, Qthread, hwloc
# Cuda, OpenMP, Threads, Qthreads, hwloc
#
# module loaded on 'kokkos-dev.sandia.gov' for this build
#
@ -82,13 +82,13 @@ CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
#-----------------------------------------------------------------------------
# Qthread
# Qthreads
QTHREAD_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
QTHREADS_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREAD:BOOL=ON"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_INCLUDE_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/include"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_LIBRARY_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/lib"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREADS:BOOL=ON"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_INCLUDE_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/include"
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_LIBRARY_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/lib"
#-----------------------------------------------------------------------------
# C++11
@ -108,6 +108,3 @@ rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
#-----------------------------------------------------------------------------

View File

@ -5,3 +5,4 @@ tag: 2.02.00 date: 10:30:2016 master: 6c90a581 develop: ca3dd56e
tag: 2.02.01 date: 11:01:2016 master: 9c698c86 develop: b0072304
tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641

View File

@ -6,7 +6,7 @@
set -o pipefail
# Determine current machine
# Determine current machine.
MACHINE=""
HOSTNAME=$(hostname)
@ -45,10 +45,11 @@ CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limi
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CUDA_WARNING_FLAGS=""
# Default. Machine specific can override
# Default. Machine specific can override.
DEBUG=False
ARGS=""
CUSTOM_BUILD_LIST=""
QTHREADS_PATH=""
DRYRUN=False
BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
@ -60,74 +61,78 @@ PRINT_HELP=False
OPT_FLAG=""
KOKKOS_OPTIONS=""
#
# Handle arguments
# Handle arguments.
#
while [[ $# > 0 ]]
do
key="$1"
case $key in
--kokkos-path*)
KOKKOS_PATH="${key#*=}"
;;
--build-list*)
CUSTOM_BUILD_LIST="${key#*=}"
;;
--debug*)
DEBUG=True
;;
--build-only*)
BUILD_ONLY=True
;;
--test-script*)
TEST_SCRIPT=True
;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;;
--dry-run*)
DRYRUN=True
;;
--spot-check*)
SPOT_CHECK=True
;;
--arch*)
ARCH_FLAG="--arch=${key#*=}"
;;
--opt-flag*)
OPT_FLAG="${key#*=}"
;;
--with-cuda-options*)
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
;;
--help*)
PRINT_HELP=True
;;
*)
# args, just append
ARGS="$ARGS $1"
;;
esac
shift
key="$1"
case $key in
--kokkos-path*)
KOKKOS_PATH="${key#*=}"
;;
--qthreads-path*)
QTHREADS_PATH="${key#*=}"
;;
--build-list*)
CUSTOM_BUILD_LIST="${key#*=}"
;;
--debug*)
DEBUG=True
;;
--build-only*)
BUILD_ONLY=True
;;
--test-script*)
TEST_SCRIPT=True
;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;;
--dry-run*)
DRYRUN=True
;;
--spot-check*)
SPOT_CHECK=True
;;
--arch*)
ARCH_FLAG="--arch=${key#*=}"
;;
--opt-flag*)
OPT_FLAG="${key#*=}"
;;
--with-cuda-options*)
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
;;
--help*)
PRINT_HELP=True
;;
*)
# args, just append
ARGS="$ARGS $1"
;;
esac
shift
done
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
# set kokkos path
# Set kokkos path.
if [ -z "$KOKKOS_PATH" ]; then
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
else
# Ensure KOKKOS_PATH is abs path
# Ensure KOKKOS_PATH is abs path.
KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
fi
#
# Machine specific config
# Machine specific config.
#
if [ "$MACHINE" = "sems" ]; then
@ -153,21 +158,17 @@ if [ "$MACHINE" = "sems" ]; then
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
fi
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
@ -177,7 +178,7 @@ elif [ "$MACHINE" = "white" ]; then
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
# Don't do pthread on white
# Don't do pthread on white.
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
@ -185,9 +186,11 @@ elif [ "$MACHINE" = "white" ]; then
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=Power8,Kepler37"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
elif [ "$MACHINE" = "bowman" ]; then
@ -300,14 +303,14 @@ elif [ "$MACHINE" = "apollo" ]; then
if [ -z "$ARCH_FLAG" ]; then
ARCH_FLAG="--arch=SNB,Kepler35"
fi
NUM_JOBS_TO_RUN_IN_PARALLEL=2
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
@ -315,76 +318,78 @@ declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
if [ "$PRINT_HELP" = "True" ]; then
echo "test_all_sandia <ARGS> <OPTIONS>:"
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel"
echo "--spot-check: Minimal test set to issue pull request"
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
echo "--arch=ARCHITECTURE: overwrite architecture flags"
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
echo "--build-list=BUILD,BUILD,BUILD..."
echo " Provide a comma-separated list of builds instead of running all builds"
echo " Valid items:"
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo ""
echo "test_all_sandia <ARGS> <OPTIONS>:"
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel"
echo "--spot-check: Minimal test set to issue pull request"
echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything"
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
echo "--arch=ARCHITECTURE: overwrite architecture flags"
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
echo "--build-list=BUILD,BUILD,BUILD..."
echo " Provide a comma-separated list of builds instead of running all builds"
echo " Valid items:"
echo " OpenMP, Pthread, Qthreads, Serial, OpenMP_Serial, Pthread_Serial"
echo " Qthreads_Serial, Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo ""
echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers sems"
for COMPILER_DATA in "${COMPILERS[@]}"; do
echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers sems"
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
COMPILER=${ARR[0]}
echo " $COMPILER"
done
echo ""
done
echo ""
echo "Examples:"
echo " Run all tests"
echo " % test_all_sandia"
echo ""
echo " Run all gcc tests"
echo " % test_all_sandia gcc"
echo ""
echo " Run all gcc/4.7.2 and all intel tests"
echo " % test_all_sandia gcc/4.7.2 intel"
echo ""
echo " Run all tests in debug"
echo " % test_all_sandia --debug"
echo ""
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
echo ""
echo "If you want to kill the tests, do:"
echo " hit ctrl-z"
echo " % kill -9 %1"
echo
exit 0
echo "Examples:"
echo " Run all tests"
echo " % test_all_sandia"
echo ""
echo " Run all gcc tests"
echo " % test_all_sandia gcc"
echo ""
echo " Run all gcc/4.7.2 and all intel tests"
echo " % test_all_sandia gcc/4.7.2 intel"
echo ""
echo " Run all tests in debug"
echo " % test_all_sandia --debug"
echo ""
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
echo ""
echo "If you want to kill the tests, do:"
echo " hit ctrl-z"
echo " % kill -9 %1"
echo
exit 0
fi
# set build type
# Set build type.
if [ "$DEBUG" = "True" ]; then
BUILD_TYPE=debug
else
BUILD_TYPE=release
fi
# If no args provided, do all compilers
# If no args provided, do all compilers.
if [ -z "$ARGS" ]; then
ARGS='?'
fi
# Process args to figure out which compilers to test
# Process args to figure out which compilers to test.
COMPILERS_TO_TEST=""
for ARG in $ARGS; do
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
COMPILER=${ARR[0]}
if [[ "$COMPILER" = $ARG* ]]; then
if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
@ -395,8 +400,35 @@ for ARG in $ARGS; do
done
done
# Check if Qthreads build requested.
HAVE_QTHREADS_BUILD="False"
if [ -n "$CUSTOM_BUILD_LIST" ]; then
if [[ "$CUSTOM_BUILD_LIST" = *Qthreads* ]]; then
HAVE_QTHREADS_BUILD="True"
fi
else
for COMPILER_DATA in "${COMPILERS[@]}"; do
ARR=($COMPILER_DATA)
BUILD_LIST=${ARR[2]}
if [[ "$BUILD_LIST" = *Qthreads* ]]; then
HAVE_QTHREADS_BUILD="True"
fi
done
fi
# Ensure Qthreads path is set if Qthreads build is requested.
if [ "$HAVE_QTHREADS_BUILD" = "True" ]; then
if [ -z "$QTHREADS_PATH" ]; then
echo "Need to supply Qthreads path (--qthreads-path) when testing Qthreads backend." >&2
exit 1
else
# Strip trailing slashes from path.
QTHREADS_PATH=$(echo $QTHREADS_PATH | sed 's/\/*$//')
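      # Illustrative example: --qthreads-path=/opt/qthreads/// becomes /opt/qthreads.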
fi
fi
#
# Functions
# Functions.
#
# get_compiler_name <COMPILER>
@ -409,7 +441,7 @@ get_compiler_version() {
echo $1 | cut -d/ -f2
}
# Do not call directly
# Do not call directly.
get_compiler_data() {
local compiler=$1
local item=$2
@ -419,13 +451,14 @@ get_compiler_data() {
local compiler_data
for compiler_data in "${COMPILERS[@]}" ; do
local arr=($compiler_data)
if [ "$compiler" = "${arr[0]}" ]; then
echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
return 0
fi
done
# Not found
# Not found.
echo "Unreconized compiler $compiler" >&2
exit 1
}
@ -459,14 +492,14 @@ run_cmd() {
# report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
report_and_log_test_result() {
# Use sane var names
# Use sane var names.
local success=$1; local desc=$2; local comment=$3;
if [ "$success" = "0" ]; then
echo " PASSED $desc"
echo $comment > $PASSED_DIR/$desc
else
# For failures, comment should be the name of the phase that failed
# For failures, comment should be the name of the phase that failed.
echo " FAILED $desc" >&2
echo $comment > $FAILED_DIR/$desc
cat ${desc}.${comment}.log
@ -494,16 +527,16 @@ setup_env() {
# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
single_build_and_test() {
# Use sane var names
# Use sane var names.
local compiler=$1; local build=$2; local build_type=$3;
# set up env
# Set up env.
mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
cd $ROOT_DIR/$compiler/"${build}-$build_type"
local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
# Set up flags
# Set up flags.
local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
local compiler_exe=$(get_compiler_exe_name $compiler)
@ -511,6 +544,14 @@ single_build_and_test() {
local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
fi
if [[ "$build" = *Qthreads* ]]; then
if [[ "$build_type" = hwloc* ]]; then
local extra_args="$extra_args --qthreads-path=${QTHREADS_PATH}_hwloc"
else
local extra_args="$extra_args --qthreads-path=$QTHREADS_PATH"
fi
fi
if [[ "$OPT_FLAG" = "" ]]; then
OPT_FLAG="-O3"
fi
@ -522,11 +563,6 @@ single_build_and_test() {
local cxxflags="$OPT_FLAG $compiler_warning_flags"
fi
if [[ "$compiler" == cuda* ]]; then
cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
export TMPDIR=$(pwd)
fi
if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
fi
@ -538,6 +574,7 @@ single_build_and_test() {
if [ "$TEST_SCRIPT" = "True" ]; then
local rand=$[ 1 + $[ RANDOM % 10 ]]
sleep $rand
if [ $rand -gt 5 ]; then
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi
@ -547,6 +584,7 @@ single_build_and_test() {
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"
if [[ "$BUILD_ONLY" == False ]]; then
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
local -i run_end_time=$(date +%s)
@ -576,7 +614,7 @@ run_in_background() {
local compiler=$1
local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
# don't override command line input
# Don't override command line input.
# if [[ "$BUILD_ONLY" == True ]]; then
# num_jobs=8
# else
@ -591,7 +629,7 @@ run_in_background() {
# build_and_test_all <COMPILER>
build_and_test_all() {
# Get compiler data
# Get compiler data.
local compiler=$1
if [ -z "$CUSTOM_BUILD_LIST" ]; then
local compiler_build_list=$(get_compiler_build_list $compiler)
@ -599,13 +637,13 @@ build_and_test_all() {
local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
fi
# do builds
# Do builds.
local build
for build in $compiler_build_list
do
run_in_background $compiler $build $BUILD_TYPE
# If not cuda, do a hwloc test too
# If not cuda, do a hwloc test too.
if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
fi
@ -655,7 +693,7 @@ wait_summarize_and_exit() {
}
#
# Main
# Main.
#
ROOT_DIR=$(get_test_root_dir)

View File

@ -60,7 +60,7 @@ class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
{
public:
typedef ViewTraits< DataType , P ... > traits ;
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
private:
@ -123,30 +123,41 @@ public:
enum { Rank = 1 };
KOKKOS_INLINE_FUNCTION constexpr size_t size() const
KOKKOS_INLINE_FUNCTION
size_t size() const noexcept
{
return
Kokkos::Impl::MemorySpaceAccess
uintptr_t n = 0 ;
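// The runtime chunk count is stored one past the chunk pointer array, at
// m_chunks[m_chunk_max].  Read it directly when this execution space can
// access the view's memory space; otherwise (host code querying a device
// view) deep-copy that single word back to the host below.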
if ( Kokkos::Impl::MemorySpaceAccess
< Kokkos::Impl::ActiveExecutionMemorySpace
, typename traits::memory_space
>::accessible
? // Runtime size is at the end of the chunk pointer array
(*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
<< m_chunk_shift
: 0 ;
>::accessible ) {
n = *reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max );
}
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
else {
Kokkos::Impl::DeepCopy< Kokkos::HostSpace
, typename traits::memory_space
, Kokkos::HostSpace::execution_space >
( & n
, reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max )
, sizeof(uintptr_t) );
}
#endif
return n << m_chunk_shift ;
}
template< typename iType >
KOKKOS_INLINE_FUNCTION constexpr
KOKKOS_INLINE_FUNCTION
size_t extent( const iType & r ) const
{ return r == 0 ? size() : 1 ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION constexpr
KOKKOS_INLINE_FUNCTION
size_t extent_int( const iType & r ) const
{ return r == 0 ? size() : 1 ; }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return size(); }
KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1 ; }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1 ; }
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1 ; }
@ -270,10 +281,18 @@ public:
}
/** \brief Resizing in serial can grow or shrink the array size. */
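// This overload is enabled (via enable_if) only when the host can access
// the view's memory space; a device-dispatched overload follows below.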
template< typename IntType >
inline
void resize_serial( size_t n )
typename std::enable_if
< std::is_integral<IntType>::value &&
Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
, typename traits::memory_space
>::accessible
>::type
resize_serial( IntType const & n )
{
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
typedef typename traits::value_type value_type ;
typedef value_type * pointer_type ;
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
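// NC: number of chunks required to hold n elements, rounded up.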
@ -286,8 +305,8 @@ public:
if ( *pc < NC ) {
while ( *pc < NC ) {
m_chunks[*pc] =
m_pool.allocate( sizeof(traits::value_type) << m_chunk_shift );
m_chunks[*pc] = reinterpret_cast<pointer_type>
( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
++*pc ;
}
}
@ -295,12 +314,90 @@ public:
while ( NC + 1 <= *pc ) {
--*pc ;
m_pool.deallocate( m_chunks[*pc]
, sizeof(traits::value_type) << m_chunk_shift );
, sizeof(value_type) << m_chunk_shift );
m_chunks[*pc] = 0 ;
}
}
}
//----------------------------------------
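// When the host cannot access the view's memory space (e.g. CudaSpace),
// resize_serial cannot manipulate the chunk array directly.  This functor
// performs the same grow/shrink loop, but is dispatched as a one-element
// parallel_for on the view's execution space (see resize_serial below).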
struct ResizeSerial {
memory_pool m_pool ;
typename traits::value_type ** m_chunks ;
uintptr_t * m_pc ;
uintptr_t m_nc ;
unsigned m_chunk_shift ;
KOKKOS_INLINE_FUNCTION
void operator()( int ) const
{
typedef typename traits::value_type value_type ;
typedef value_type * pointer_type ;
if ( *m_pc < m_nc ) {
while ( *m_pc < m_nc ) {
m_chunks[*m_pc] = reinterpret_cast<pointer_type>
( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
++*m_pc ;
}
}
else {
while ( m_nc + 1 <= *m_pc ) {
--*m_pc ;
m_pool.deallocate( m_chunks[*m_pc]
, sizeof(value_type) << m_chunk_shift );
m_chunks[*m_pc] = 0 ;
}
}
}
ResizeSerial( memory_pool const & arg_pool
, typename traits::value_type ** arg_chunks
, uintptr_t * arg_pc
, uintptr_t arg_nc
, unsigned arg_chunk_shift
)
: m_pool( arg_pool )
, m_chunks( arg_chunks )
, m_pc( arg_pc )
, m_nc( arg_nc )
, m_chunk_shift( arg_chunk_shift )
{}
};
template< typename IntType >
inline
typename std::enable_if
< std::is_integral<IntType>::value &&
! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
, typename traits::memory_space
>::accessible
>::type
resize_serial( IntType const & n )
{
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
if ( m_chunk_max < NC ) {
Kokkos::abort("DynamicView::resize_serial exceeded maximum size");
}
// Must dispatch the resize as a kernel on the view's execution space.
typedef Kokkos::RangePolicy< typename traits::execution_space > Range ;
uintptr_t * const pc =
reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
Kokkos::Impl::ParallelFor<ResizeSerial,Range>
closure( ResizeSerial( m_pool, m_chunks, pc, NC, m_chunk_shift )
, Range(0,1) );
closure.execute();
traits::execution_space::fence();
}
//----------------------------------------------------------------------
~DynamicView() = default ;
@ -311,15 +408,17 @@ public:
DynamicView & operator = ( const DynamicView & ) = default ;
template< class RT , class ... RP >
KOKKOS_INLINE_FUNCTION
DynamicView( const DynamicView<RT,RP...> & rhs )
: m_pool( rhs.m_pool )
, m_track( rhs.m_track )
, m_chunks( rhs.m_chunks )
, m_chunks( (typename traits::value_type **) rhs.m_chunks )
, m_chunk_shift( rhs.m_chunk_shift )
, m_chunk_mask( rhs.m_chunk_mask )
, m_chunk_max( rhs.m_chunk_max )
{
typedef typename DynamicView<RT,RP...>::traits SrcTraits ;
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
static_assert( Mapping::is_assignable , "Incompatible DynamicView copy construction" );
}
//----------------------------------------------------------------------
@ -400,8 +499,6 @@ public:
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
{
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
// A functor to deallocate all of the chunks upon final destruction
typedef typename traits::memory_space memory_space ;

View File

@ -230,16 +230,17 @@ public:
typedef typename Impl::remove_const<declared_value_type>::type value_type;
typedef typename Impl::add_const<value_type>::type const_value_type;
typedef Device execution_space;
typedef Device device_type;
typedef typename Device::execution_space execution_space;
typedef Hasher hasher_type;
typedef EqualTo equal_to_type;
typedef uint32_t size_type;
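// Device may now be a full device type or a memory space (see issue #711);
// the views and deep copies below are therefore expressed in terms of
// device_type rather than execution_space.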
//map_types
typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type;
typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type> insertable_map_type;
typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type> modifiable_map_type;
typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type> const_map_type;
typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type;
typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type> insertable_map_type;
typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type> modifiable_map_type;
typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type> const_map_type;
static const bool is_set = std::is_same<void,value_type>::value;
static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
@ -264,18 +265,18 @@ private:
typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
typedef typename Impl::if_c< is_insertable_map
, View< key_type *, execution_space>
, View< const key_type *, execution_space, MemoryTraits<RandomAccess> >
, View< key_type *, device_type>
, View< const key_type *, device_type, MemoryTraits<RandomAccess> >
>::type key_type_view;
typedef typename Impl::if_c< is_insertable_map || is_modifiable_map
, View< impl_value_type *, execution_space>
, View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> >
, View< impl_value_type *, device_type>
, View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> >
>::type value_type_view;
typedef typename Impl::if_c< is_insertable_map
, View< size_type *, execution_space>
, View< const size_type *, execution_space, MemoryTraits<RandomAccess> >
, View< size_type *, device_type>
, View< const size_type *, device_type, MemoryTraits<RandomAccess> >
>::type size_type_view;
typedef typename Impl::if_c< is_insertable_map
@ -285,7 +286,7 @@ private:
enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
enum { num_scalars = 3 };
typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view;
typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view;
public:
//! \name Public member functions
@ -757,7 +758,7 @@ public:
Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy;
raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0());
raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0());
@ -781,21 +782,21 @@ private: // private member functions
void set_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
const int true_ = true;
raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int));
}
void reset_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
const int false_ = false;
raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int));
}
bool get_flag(int flag) const
{
typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > raw_deep_copy;
typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy;
int result = false;
raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int));
return result;

View File

@ -3,38 +3,49 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
SET(SOURCES
UnitTestMain.cpp
TestCuda.cpp
)
SET(LIBRARIES kokkoscore)
IF(Kokkos_ENABLE_Pthread)
LIST( APPEND SOURCES
TestThreads.cpp
)
ENDIF()
IF(Kokkos_ENABLE_Serial)
LIST( APPEND SOURCES
TestSerial.cpp
)
ENDIF()
IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES
TestOpenMP.cpp
)
ENDIF()
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest
SOURCES ${SOURCES}
UnitTest_Threads
SOURCES TestThreads.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()
IF(Kokkos_ENABLE_Serial)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Serial
SOURCES TestSerial.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()
IF(Kokkos_ENABLE_OpenMP)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_OpenMP
SOURCES TestOpenMP.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()
IF(Kokkos_ENABLE_Cuda)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Cuda
SOURCES TestCuda.cpp UnitTestMain.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
ENDIF()

View File

@ -64,6 +64,7 @@ struct TestDynamicView
typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
typedef typename view_type::const_type const_view_type ;
typedef typename Kokkos::TeamPolicy<execution_space>::member_type member_type ;
typedef double value_type;
@ -136,6 +137,8 @@ struct TestDynamicView
view_type da("A",pool,arg_total_size);
const_view_type ca(da);
// printf("TestDynamicView::run(%d) construct test functor\n",arg_total_size);
TestDynamicView functor(da,arg_total_size);

View File

@ -1,5 +1,5 @@
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
TEST_OPTIONAL_TPLS CUSPARSE
)

View File

@ -30,7 +30,7 @@
#cmakedefine KOKKOS_HAVE_PTHREAD
#cmakedefine KOKKOS_HAVE_SERIAL
#cmakedefine KOKKOS_HAVE_QTHREAD
#cmakedefine KOKKOS_HAVE_QTHREADS
#cmakedefine KOKKOS_HAVE_Winthread
#cmakedefine KOKKOS_HAVE_OPENMP
#cmakedefine KOKKOS_HAVE_HWLOC

View File

@ -60,4 +60,3 @@ clean: kokkos-clean
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc

View File

@ -52,6 +52,8 @@
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestMDRange.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
@ -72,6 +74,14 @@ class cuda : public ::testing::Test {
}
};
//TEST_F( cuda, mdrange_lr ) {
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutRight>( 5, 8, "Kokkos::Cuda" )) );
//}
//TEST_F( cuda, mdrange_ll ) {
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutLeft>( 5, 8, "Kokkos::Cuda" )) );
//}
TEST_F( cuda, hexgrad )
{
EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );

View File

@ -60,6 +60,342 @@ namespace Test {
enum { NUMBER_OF_TRIALS = 5 };
template< class DeviceType , class LayoutType >
void run_test_mdrange( int exp_beg , int exp_end, const char deviceTypeName[], int range_offset = 0, int tile_offset = 0 )
// exp_beg = 6 => 2^6 = 64 is the starting range length.
{
#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0
std::string label_mdrange ;
label_mdrange.append( "\"MDRange< double , " );
label_mdrange.append( deviceTypeName );
label_mdrange.append( " >\"" );
std::string label_range_col2 ;
label_range_col2.append( "\"RangeColTwo< double , " );
label_range_col2.append( deviceTypeName );
label_range_col2.append( " >\"" );
std::string label_range_col_all ;
label_range_col_all.append( "\"RangeColAll< double , " );
label_range_col_all.append( deviceTypeName );
label_range_col_all.append( " >\"" );
if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
std::cout << "--------------------------------------------------------------\n"
<< "Performance tests for MDRange Layout Right"
<< "\n--------------------------------------------------------------" << std::endl;
} else {
std::cout << "--------------------------------------------------------------\n"
<< "Performance tests for MDRange Layout Left"
<< "\n--------------------------------------------------------------" << std::endl;
}
for (int i = exp_beg ; i < exp_end ; ++i) {
const int range_length = (1<<i) + range_offset;
std::cout << "\n--------------------------------------------------------------\n"
<< "--------------------------------------------------------------\n"
<< "MDRange Test: range bounds: " << range_length << " , " << range_length << " , " << range_length
<< "\n--------------------------------------------------------------\n"
<< "--------------------------------------------------------------\n";
// << std::endl;
int t0_min = 0, t1_min = 0, t2_min = 0;
double seconds_min = 0.0;
// Test 1: The MDRange in full
{
int t0 = 1, t1 = 1, t2 = 1;
int counter = 1;
#if !defined(KOKKOS_HAVE_CUDA)
int min_bnd = 8;
int tfast = range_length;
#else
int min_bnd = 2;
int tfast = 32;
#endif
while ( tfast >= min_bnd ) {
int tmid = min_bnd;
while ( tmid < tfast ) {
t0 = min_bnd;
t1 = tmid;
t2 = tfast;
int t2_rev = min_bnd;
int t1_rev = tmid;
int t0_rev = tfast;
#if defined(KOKKOS_HAVE_CUDA)
//Note: Product of tile sizes must be < 1024 for Cuda
if ( t0*t1*t2 >= 1024 ) {
printf(" Exceeded Cuda tile limits; onto next range set\n\n");
break;
}
#endif
// Run 1 with tiles LayoutRight style
double seconds_1 = 0;
{ seconds_1 = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0, t1, t2) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << label_mdrange
<< " , " << t0 << " , " << t1 << " , " << t2
<< " , " << seconds_1
<< std::endl ;
#endif
if ( counter == 1 ) {
seconds_min = seconds_1;
t0_min = t0;
t1_min = t1;
t2_min = t2;
}
else {
if ( seconds_1 < seconds_min )
{
seconds_min = seconds_1;
t0_min = t0;
t1_min = t1;
t2_min = t2;
}
}
// Run 2 with tiles LayoutLeft style - reverse order of tile dims
double seconds_1rev = 0;
{ seconds_1rev = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0_rev, t1_rev, t2_rev) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << label_mdrange
<< " , " << t0_rev << " , " << t1_rev << " , " << t2_rev
<< " , " << seconds_1rev
<< std::endl ;
#endif
if ( seconds_1rev < seconds_min )
{
seconds_min = seconds_1rev;
t0_min = t0_rev;
t1_min = t1_rev;
t2_min = t2_rev;
}
++counter;
tmid <<= 1;
} //end inner while
tfast >>=1;
} //end outer while
std::cout << "\n"
<< "--------------------------------------------------------------\n"
<< label_mdrange
<< "\n Min values "
<< "\n Range length per dim (3D): " << range_length
<< "\n TileDims: " << t0_min << " , " << t1_min << " , " << t2_min
<< "\n Min time: " << seconds_min
<< "\n---------------------------------------------------------------"
<< std::endl ;
} //end scope
#if !defined(KOKKOS_HAVE_CUDA)
double seconds_min_c = 0.0;
int t0c_min = 0, t1c_min = 0, t2c_min = 0;
int counter = 1;
{
int min_bnd = 8;
// Test 1_c: MDRange with 0 for the 'inner' tile dim; this case utilizes the full span in that direction and should behave similarly to Collapse<2>.
if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value ) {
for ( unsigned int T0 = min_bnd; T0 < static_cast<unsigned int>(range_length); T0<<=1 ) {
for ( unsigned int T1 = min_bnd; T1 < static_cast<unsigned int>(range_length); T1<<=1 ) {
double seconds_c = 0;
{ seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, T0, T1, 0) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << " MDRange LR with '0' tile - collapse-like \n"
<< label_mdrange
<< " , " << T0 << " , " << T1 << " , " << range_length
<< " , " << seconds_c
<< std::endl ;
#endif
t2c_min = range_length;
if ( counter == 1 ) {
seconds_min_c = seconds_c;
t0c_min = T0;
t1c_min = T1;
}
else {
if ( seconds_c < seconds_min_c )
{
seconds_min_c = seconds_c;
t0c_min = T0;
t1c_min = T1;
}
}
++counter;
}
}
}
else {
for ( unsigned int T1 = min_bnd; T1 <= static_cast<unsigned int>(range_length); T1<<=1 ) {
for ( unsigned int T2 = min_bnd; T2 <= static_cast<unsigned int>(range_length); T2<<=1 ) {
double seconds_c = 0;
{ seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, 0, T1, T2) ; }
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
std::cout << " MDRange LL with '0' tile - collapse-like \n"
<< label_mdrange
<< " , " <<range_length << " < " << T1 << " , " << T2
<< " , " << seconds_c
<< std::endl ;
#endif
t0c_min = range_length;
if ( counter == 1 ) {
seconds_min_c = seconds_c;
t1c_min = T1;
t2c_min = T2;
}
else {
if ( seconds_c < seconds_min_c )
{
seconds_min_c = seconds_c;
t1c_min = T1;
t2c_min = T2;
}
}
++counter;
}
}
}
std::cout
// << "--------------------------------------------------------------\n"
<< label_mdrange
<< " Collapse<2> style: "
<< "\n Min values "
<< "\n Range length per dim (3D): " << range_length
<< "\n TileDims: " << t0c_min << " , " << t1c_min << " , " << t2c_min
<< "\n Min time: " << seconds_min_c
<< "\n---------------------------------------------------------------"
<< std::endl ;
} //end scope test 2
#endif
// Test 2: RangePolicy Collapse2 style
double seconds_2 = 0;
{ seconds_2 = RangePolicyCollapseTwo< DeviceType , double , LayoutType >::test_index_collapse_two(range_length,range_length,range_length) ; }
std::cout << label_range_col2
<< " , " << range_length
<< " , " << seconds_2
<< std::endl ;
// Test 3: RangePolicy Collapse all style - not necessary, always slow
/*
double seconds_3 = 0;
{ seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType >::test_collapse_all(range_length,range_length,range_length) ; }
std::cout << label_range_col_all
<< " , " << range_length
<< " , " << seconds_3
<< "\n---------------------------------------------------------------"
<< std::endl ;
*/
// Compare fastest times... will never be collapse all so ignore it
// seconds_min = tiled MDRange
// seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - only for non-Cuda, else tile too long
// seconds_2 = collapse<2>-style RangePolicy
// seconds_3 = collapse<3>-style RangePolicy
#if !defined(KOKKOS_HAVE_CUDA)
if ( seconds_min < seconds_min_c ) {
if ( seconds_min < seconds_2 ) {
std::cout << "--------------------------------------------------------------\n"
<< " Fastest run: MDRange tiled\n"
<< " Time: " << seconds_min
<< " Difference: " << seconds_2 - seconds_min
<< " Other times: \n"
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
else if ( seconds_min > seconds_2 ) {
std::cout << " Fastest run: Collapse2 RangePolicy\n"
<< " Time: " << seconds_2
<< " Difference: " << seconds_min - seconds_2
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
}
else if ( seconds_min > seconds_min_c ) {
if ( seconds_min_c < seconds_2 ) {
std::cout << "--------------------------------------------------------------\n"
<< " Fastest run: MDRange collapse-like (tiledim = span on fast dim) type\n"
<< " Time: " << seconds_min_c
<< " Difference: " << seconds_2 - seconds_min_c
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
else if ( seconds_min_c > seconds_2 ) {
std::cout << " Fastest run: Collapse2 RangePolicy\n"
<< " Time: " << seconds_2
<< " Difference: " << seconds_min_c - seconds_2
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
} // end else if
#else
if ( seconds_min < seconds_2 ) {
std::cout << "--------------------------------------------------------------\n"
<< " Fastest run: MDRange tiled\n"
<< " Time: " << seconds_min
<< " Difference: " << seconds_2 - seconds_min
<< " Other times: \n"
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
else if ( seconds_min > seconds_2 ) {
std::cout << " Fastest run: Collapse2 RangePolicy\n"
<< " Time: " << seconds_2
<< " Difference: " << seconds_min - seconds_2
<< " Other times: \n"
<< " MDrange Tiled: " << seconds_min << "\n"
<< "\n--------------------------------------------------------------"
<< "\n--------------------------------------------------------------"
//<< "\n\n"
<< std::endl;
}
#endif
} //end for
#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
}
template< class DeviceType >

View File

@ -66,6 +66,8 @@ const char TestHostDeviceName[] = "Kokkos::Serial" ;
#include <impl/Kokkos_Timer.hpp>
#include <PerfTestMDRange.hpp>
#include <PerfTestHexGrad.hpp>
#include <PerfTestBlasKernels.hpp>
#include <PerfTestGramSchmidt.hpp>
@ -102,6 +104,14 @@ protected:
}
};
//TEST_F( host, mdrange_lr ) {
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutRight> (5, 8, TestHostDeviceName) ) );
//}
//TEST_F( host, mdrange_ll ) {
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutLeft> (5, 8, TestHostDeviceName) ) );
//}
TEST_F( host, hexgrad ) {
EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
}

View File

@ -0,0 +1,564 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
namespace Test {
template< class DeviceType
, typename ScalarType = double
, typename TestLayout = Kokkos::LayoutRight
>
struct MultiDimRangePerf3D
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
using iterate_type = Kokkos::Experimental::Iterate;
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
typedef typename view_type::HostMirror host_view_type;
view_type A;
view_type B;
const long irange;
const long jrange;
const long krange;
MultiDimRangePerf3D(const view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
: A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const long i, const long j, const long k) const
{
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
struct InitZeroTag {};
// struct InitViewTag {};
struct Init
{
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const long i, const long j, const long k) const
{
input(i,j,k) = 1.0;
}
KOKKOS_INLINE_FUNCTION
void operator()(const InitZeroTag&, const long i, const long j, const long k) const
{
input(i,j,k) = 0;
}
view_type input;
const long irange;
const long jrange;
const long krange;
};
static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1)
{
//This test performs multidim range over all dims
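// icount/jcount/kcount are the per-dimension extents, Ti/Tj/Tk the MDRange
// tile dimensions; the minimum time over 'iter' runs is returned.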
view_type Atest("Atest", icount, jcount, kcount);
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
typedef MultiDimRangePerf3D<execution_space,ScalarType,TestLayout> FunctorType;
double dt_min = 0;
// LayoutRight
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}});
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}});
typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > MDRangeType;
using tile_type = typename MDRangeType::tile_type;
using point_type = typename MDRangeType::point_type;
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - only the first run
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " multi Ahost = " << Ahost(l,j,k) << " expected = " << check
<< " multi Bhost(ijk) = " << Bhost(l,j,k)
<< " multi Bhost(l+1jk) = " << Bhost(l+1,j,k)
<< " multi Bhost(l+2jk) = " << Bhost(l+2,j,k)
<< " multi Bhost(ij+1k) = " << Bhost(l,j+1,k)
<< " multi Bhost(ij+2k) = " << Bhost(l,j+2,k)
<< " multi Bhost(ijk+1) = " << Bhost(l,j,k+1)
<< " multi Bhost(ijk+2) = " << Bhost(l,j,k+2)
<< std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << "LR multi: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " multi: No errors!" << std::endl; }
}
} //end for
}
// LayoutLeft
else {
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}});
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}});
//typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
//using tile_type = typename MDRangeType::tile_type;
//using point_type = typename MDRangeType::point_type;
//Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} );
Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
execution_space::fence();
Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - only the first run
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " multi Ahost = " << Ahost(l,j,k) << " expected = " << check
<< " multi Bhost(ijk) = " << Bhost(l,j,k)
<< " multi Bhost(l+1jk) = " << Bhost(l+1,j,k)
<< " multi Bhost(l+2jk) = " << Bhost(l+2,j,k)
<< " multi Bhost(ij+1k) = " << Bhost(l,j+1,k)
<< " multi Bhost(ij+2k) = " << Bhost(l,j+2,k)
<< " multi Bhost(ijk+1) = " << Bhost(l,j,k+1)
<< " multi Bhost(ijk+2) = " << Bhost(l,j,k+2)
<< std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << " LL multi run: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " multi: No errors!" << std::endl; }
}
} //end for
}
return dt_min;
}
};
template< class DeviceType
, typename ScalarType = double
, typename TestLayout = Kokkos::LayoutRight
>
struct RangePolicyCollapseTwo
{
// RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for multi-dim; unroll 2 dims in one-dim
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
typedef TestLayout layout;
using iterate_type = Kokkos::Experimental::Iterate;
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
typedef typename view_type::HostMirror host_view_type;
view_type A;
view_type B;
const long irange;
const long jrange;
const long krange;
RangePolicyCollapseTwo(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
: A(A_), B(B_) , irange(irange_), jrange(jrange_), krange(krange_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
//id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r
//r = j + i*Nj
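// e.g. with jrange = 4: r = 9 decodes to i = 2, j = 1.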
long i = int(r / jrange);
long j = int( r - i*jrange);
for (int k = 0; k < krange; ++k) {
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
//id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r
//r = j + k*Nj
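// e.g. with jrange = 4: r = 9 decodes to k = 2, j = 1.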
long k = int(r / jrange);
long j = int( r - k*jrange);
for (int i = 0; i < irange; ++i) {
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
}
}
struct Init
{
view_type input;
const long irange;
const long jrange;
const long krange;
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
long i = int(r / jrange);
long j = int( r - i*jrange);
for (int k = 0; k < krange; ++k) {
input(i,j,k) = 1;
}
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
long k = int(r / jrange);
long j = int( r - k*jrange);
for (int i = 0; i < irange; ++i) {
input(i,j,k) = 1;
}
}
}
};
static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
{
// This test refers to collapsing two dims while using the RangePolicy
view_type Atest("Atest", icount, jcount, kcount);
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
typedef RangePolicyCollapseTwo<execution_space,ScalarType,TestLayout> FunctorType;
long collapse_index_rangeA = 0;
long collapse_index_rangeB = 0;
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
collapse_index_rangeA = icount*jcount;
collapse_index_rangeB = (icount+2)*(jcount+2);
// std::cout << " LayoutRight " << std::endl;
} else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value ) {
collapse_index_rangeA = kcount*jcount;
collapse_index_rangeB = (kcount+2)*(jcount+2);
// std::cout << " LayoutLeft " << std::endl;
} else {
std::cout << " LayoutRight or LayoutLeft required - will pass 0 as range instead " << std::endl;
exit(-1);
}
Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA) );
Kokkos::RangePolicy<execution_space> policy_initB(0, (collapse_index_rangeB) );
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - first iteration only
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " flat Ahost = " << Ahost(l,j,k) << " expected = " << check << std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << " RP collapse2: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " RP collapse2: Pass! " << std::endl; }
}
}
return dt_min;
}
};
template< class DeviceType
, typename ScalarType = double
, typename TestLayout = Kokkos::LayoutRight
>
struct RangePolicyCollapseAll
{
// RangePolicy for 3D range, but will collapse all dims
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
typedef TestLayout layout;
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
typedef typename view_type::HostMirror host_view_type;
view_type A;
view_type B;
const long irange;
const long jrange;
const long krange;
RangePolicyCollapseAll(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
: A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
long i = int(r / (jrange*krange));
long j = int(( r - i*jrange*krange)/krange);
long k = int(r - i*jrange*krange - j*krange);
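// e.g. with jrange = krange = 4: r = 27 decodes to i = 1, j = 2, k = 3.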
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
long k = int(r / (irange*jrange));
long j = int(( r - k*irange*jrange)/irange);
long i = int(r - k*irange*jrange - j*irange);
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
+ B(i,j+2,k) + B(i,j+1,k)
+ B(i,j,k+2) + B(i,j,k+1)
+ B(i,j,k) );
}
}
struct Init
{
view_type input;
const long irange;
const long jrange;
const long krange;
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const long r) const
{
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
{
long i = int(r / (jrange*krange));
long j = int(( r - i*jrange*krange)/krange);
long k = int(r - i*jrange*krange - j*krange);
input(i,j,k) = 1;
}
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
{
long k = int(r / (irange*jrange));
long j = int(( r - k*irange*jrange)/irange);
long i = int(r - k*irange*jrange - j*irange);
input(i,j,k) = 1;
}
}
};
static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
{
//This test refers to collapsing all dims using the RangePolicy
view_type Atest("Atest", icount, jcount, kcount);
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
typedef RangePolicyCollapseAll<execution_space,ScalarType,TestLayout> FunctorType;
const long flat_index_range = icount*jcount*kcount;
Kokkos::RangePolicy<execution_space> policy(0, flat_index_range );
Kokkos::RangePolicy<execution_space> policy_initB(0, (icount+2)*(jcount+2)*(kcount+2) );
double dt_min = 0;
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
execution_space::fence();
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
execution_space::fence();
for (int i = 0; i < iter; ++i)
{
Kokkos::Timer timer;
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
execution_space::fence();
const double dt = timer.seconds();
if ( 0 == i ) dt_min = dt ;
else dt_min = dt < dt_min ? dt : dt_min ;
//Correctness check - first iteration only
if ( 0 == i )
{
long numErrors = 0;
host_view_type Ahost("Ahost", icount, jcount, kcount);
Kokkos::deep_copy(Ahost, Atest);
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
Kokkos::deep_copy(Bhost, Btest);
// On KNL, this check may vectorize - a print statement is added to prevent that.
// Also, compare against epsilon, as vectorization can change the bitwise answer.
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
+ Bhost(l,j,k) );
if ( Ahost(l,j,k) - check != 0 ) {
++numErrors;
std::cout << " Callapse ALL Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
<< " flat Ahost = " << Ahost(l,j,k) << " expected = " << check << std::endl;
//exit(-1);
}
} } }
if ( numErrors != 0 ) { std::cout << " RP collapse all: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
//else { std::cout << " RP collapse all: Pass! " << std::endl; }
}
}
return dt_min;
}
};
} //end namespace Test

View File

@ -92,13 +92,13 @@ LIST(APPEND SOURCES ${SOURCES_CUDA} )
INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
#-----------------------------------------------------------------------------
FILE(GLOB HEADERS_QTHREAD Qthread/*.hpp)
FILE(GLOB SOURCES_QTHREAD Qthread/*.cpp)
FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp)
FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp)
LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREAD} )
LIST(APPEND SOURCES ${SOURCES_QTHREAD} )
LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} )
LIST(APPEND SOURCES ${SOURCES_QTHREADS} )
INSTALL(FILES ${HEADERS_QTHREAD} DESTINATION ${TRILINOS_INCDIR}/Qthread/)
INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/)
#-----------------------------------------------------------------------------
@ -109,5 +109,3 @@ TRIBITS_ADD_LIBRARY(
SOURCES ${SOURCES}
DEPLIBS
)

File diff suppressed because it is too large

View File

@ -131,6 +131,7 @@ namespace Impl {
int* atomic;
int* scratch;
int* threadid;
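// Capacity of the lock arrays; set at initialization to
// Kokkos::Cuda::concurrency() so device code can wrap its search
// for a free lock slot.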
int n;
};
}
}
@ -250,6 +251,7 @@ struct CudaParallelLaunch< DriverType , true > {
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
@ -292,6 +294,7 @@ struct CudaParallelLaunch< DriverType , false > {
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif

View File

@ -59,7 +59,7 @@
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
@ -375,7 +375,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::CudaSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
SharedAllocationHeader header ;
@ -395,7 +395,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::fence(); //Make sure I can access the label ...
Kokkos::Profiling::deallocateData(
@ -412,7 +412,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
@ -442,7 +442,7 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, m_tex_obj( 0 )
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@ -479,7 +479,7 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, m_tex_obj( 0 )
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@ -510,7 +510,7 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}
@ -883,6 +883,7 @@ void init_lock_arrays_cuda_space() {
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());

View File

@ -536,6 +536,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
locks.n = Kokkos::Cuda::concurrency();
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
}
@ -620,9 +621,9 @@ void CudaInternal::finalize()
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
atomic_lock_array_cuda_space_ptr(false);
scratch_lock_array_cuda_space_ptr(false);
threadid_lock_array_cuda_space_ptr(false);
atomic_lock_array_cuda_space_ptr(true);
scratch_lock_array_cuda_space_ptr(true);
threadid_lock_array_cuda_space_ptr(true);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
@ -700,7 +701,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -739,7 +740,7 @@ void Cuda::finalize()
{
Impl::CudaInternal::singleton().finalize();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}

View File

@ -61,7 +61,7 @@
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Kokkos_Vectorization.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
@ -586,13 +586,35 @@ public:
void operator()(void) const
{
// Iterate this block through the league
int threadid = 0;
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
}
__syncthreads();
threadid = base_thread_id;
}
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
this-> template exec_team< WorkTag >(
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
, m_shmem_begin
, m_shmem_size
, m_scratch_ptr[1]
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size ) );
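
The new threadid logic above gives each resident block a private slice of level-1 team scratch by claiming a slot in the global atomic lock array. A simplified standalone sketch with hypothetical names (g_locks, acquire_scratch_slot); the real code additionally rounds the starting index up to a multiple of the block's thread count.

__device__ int g_locks[1024];                     // stands in for kokkos_impl_cuda_lock_arrays.atomic
__device__ int acquire_scratch_slot( int n ) {
  int id = ( blockIdx.x * blockDim.z + threadIdx.z ) % n;  // initial guess for this block
  while ( 0 != atomicCAS( &g_locks[id], 0, 1 ) ) {         // spin until a free slot is won
    id += blockDim.x * blockDim.y;                         // probe the next stripe
    if ( id > n ) id = 0;
  }
  return id;  // index into the scratch allocation; release later with atomicExch(&g_locks[id],0)
}
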
@ -946,11 +968,32 @@ public:
__device__ inline
void operator() () const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
int threadid = 0;
if ( m_scratch_size[1]>0 ) {
__shared__ int base_thread_id;
if (threadIdx.x==0 && threadIdx.y==0 ) {
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
int done = 0;
while (!done) {
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
if(!done) {
threadid += blockDim.x * blockDim.y;
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
}
}
base_thread_id = threadid;
}
__syncthreads();
threadid = base_thread_id;
}
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0), threadid );
}
__device__ inline
void run(const DummySHMEMReductionType&) const
void run(const DummySHMEMReductionType&, const int& threadid) const
{
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@ -964,7 +1007,7 @@ public:
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, m_scratch_ptr[1]
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
@ -992,7 +1035,7 @@ public:
}
__device__ inline
void run(const DummyShflReductionType&) const
void run(const DummyShflReductionType&, const int& threadid) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
@ -1003,7 +1046,7 @@ public:
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
, m_shmem_begin
, m_shmem_size
, m_scratch_ptr[1]
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
, m_scratch_size[1]
, league_rank
, m_league_size )
@ -1128,9 +1171,9 @@ public:
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
}
if ( m_team_size >
Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length()) {
if ( unsigned(m_team_size) >
unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
@ -1621,14 +1664,25 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Cuda
#endif
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
/** \brief Intra-thread vector parallel_reduce.
*
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
* Calls lambda(iType i, ValueType & val) for each i in [0..N).
*
* The range [0..N) is mapped to all vector lanes of
* the calling thread and a reduction of val is performed using +=
* and output into result.
*
* The identity value for the += operator is assumed to be the default
* constructed value.
*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
void parallel_reduce
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
const & loop_boundaries
, Lambda const & lambda
, ValueType & result )
{
#ifdef __CUDA_ARCH__
result = ValueType();
@ -1636,52 +1690,42 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::C
lambda(i,result);
}
if (loop_boundaries.increment > 1)
result += shfl_down(result, 1,loop_boundaries.increment);
if (loop_boundaries.increment > 2)
result += shfl_down(result, 2,loop_boundaries.increment);
if (loop_boundaries.increment > 4)
result += shfl_down(result, 4,loop_boundaries.increment);
if (loop_boundaries.increment > 8)
result += shfl_down(result, 8,loop_boundaries.increment);
if (loop_boundaries.increment > 16)
result += shfl_down(result, 16,loop_boundaries.increment);
Impl::cuda_intra_warp_vector_reduce(
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & result ) );
result = shfl(result,0,loop_boundaries.increment);
#endif
}
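
A minimal usage sketch of this += form, assuming a surrounding TeamPolicy kernel and Views x and y (N, league_size, team_size, and vector_length are all illustrative): each calling thread accumulates a dot product across its vector lanes.

Kokkos::parallel_for( Kokkos::TeamPolicy<>( league_size, team_size, vector_length ),
  KOKKOS_LAMBDA ( const Kokkos::TeamPolicy<>::member_type & team ) {
    double dot = 0;
    Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, N ),
      [=] ( const int i, double & val ) { val += x(i) * y(i); }, dot );
  });
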
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
/** \brief Intra-thread vector parallel_reduce.
*
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
* Calls lambda(iType i, ValueType & val) for each i in [0..N).
*
* The range [0..N) is mapped to all vector lanes of
* the calling thread and a reduction of val is performed
* using JoinType::operator()(ValueType& val, const ValueType& update)
* and output into result.
*
* The input value of result must be the identity value for the
* reduction operation; e.g., ( 0 , += ) or ( 1 , *= ).
*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
void parallel_reduce
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
const & loop_boundaries
, Lambda const & lambda
, JoinType const & join
, ValueType & result )
{
#ifdef __CUDA_ARCH__
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
if (loop_boundaries.increment > 1)
join( result, shfl_down(result, 1,loop_boundaries.increment));
if (loop_boundaries.increment > 2)
join( result, shfl_down(result, 2,loop_boundaries.increment));
if (loop_boundaries.increment > 4)
join( result, shfl_down(result, 4,loop_boundaries.increment));
if (loop_boundaries.increment > 8)
join( result, shfl_down(result, 8,loop_boundaries.increment));
if (loop_boundaries.increment > 16)
join( result, shfl_down(result, 16,loop_boundaries.increment));
Impl::cuda_intra_warp_vector_reduce(
Impl::Reducer< ValueType , JoinType >( join , & result ) );
init_result = shfl(result,0,loop_boundaries.increment);
#endif
}
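
The JoinType form under the same assumptions, here a max-reduction; per the new doc comment, result must start at the identity of the join, so the most negative double (DBL_MAX comes from <cfloat>).

double team_max = -DBL_MAX;   // identity element for max over doubles
Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, N ),
  [=] ( const int i, double & val ) { if ( x(i) > val ) val = x(i); },
  []  ( double & dst, const double & src ) { if ( src > dst ) dst = src; },
  team_max );
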

View File

@ -55,15 +55,163 @@
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< typename T >
__device__ inline
void cuda_shfl( T & out , T const & in , int lane ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl( *reinterpret_cast<int const *>(&in) , lane , width );
}
//Shfl based reductions
template< typename T >
__device__ inline
void cuda_shfl( T & out , T const & in , int lane ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
__device__ inline
void cuda_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
__device__ inline
void cuda_shfl_down( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
//----------------------------------------------------------------------------
template< typename T >
__device__ inline
void cuda_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
{
*reinterpret_cast<int*>(&out) =
__shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
}
template< typename T >
__device__ inline
void cuda_shfl_up( T & out , T const & in , int delta ,
typename std::enable_if
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
, int >::type width )
{
enum : int { N = sizeof(T) / sizeof(int) };
for ( int i = 0 ; i < N ; ++i ) {
reinterpret_cast<int*>(&out)[i] =
__shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
}
}
//----------------------------------------------------------------------------
/** \brief Reduce within a warp over blockDim.x, the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Requires:
* blockDim.x is power of 2
* blockDim.x <= 32 (one warp)
*
* Cannot use "butterfly" pattern because floating point
* addition is non-associative. Therefore, must broadcast
* the final result.
*/
template< class Reducer >
__device__ inline
void cuda_intra_warp_vector_reduce( Reducer const & reducer )
{
static_assert(
std::is_reference< typename Reducer::reference_type >::value , "" );
if ( 1 < blockDim.x ) {
typename Reducer::value_type tmp ;
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
}
// Broadcast from root "lane" to all other "lanes"
cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
}
}
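
A worked trace of the reduction tree, assuming blockDim.x == 4 and lane values {1, 2, 3, 4}; the final broadcast guarantees every lane holds the bitwise-identical sum, which matters precisely because floating point addition is non-associative.

// i = 2: lane 0 joins lane 2 (1+3 = 4), lane 1 joins lane 3 (2+4 = 6)
// i = 1: lane 0 joins lane 1 (4+6 = 10)
// broadcast from lane 0: every lane now holds 10
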
/** \brief Inclusive scan over blockDim.x, the "vector" dimension.
*
* This will be called within a nested, intra-team parallel operation.
* Use shuffle operations to avoid conflicts with shared memory usage.
*
* Algorithm is concurrent bottom-up reductions in triangular pattern
* where each CUDA thread is the root of a reduction tree from the
* zeroth CUDA thread to itself.
*
* Requires:
* blockDim.x is power of 2
* blockDim.x <= 32 (one warp)
*/
template< typename ValueType >
__device__ inline
void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
{
ValueType tmp ;
// Bottom up:
// [t] += [t-1] if t >= 1
// [t] += [t-2] if t >= 2
// [t] += [t-4] if t >= 4
// ...
for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
cuda_shfl_up( tmp , local , i , blockDim.x );
if ( i <= threadIdx.x ) { local += tmp ; }
}
}
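
A worked trace of the scan, assuming blockDim.x == 4 and lane values {a, b, c, d}:

// i = 1: lanes become { a, a+b, b+c, c+d }        (each lane t >= 1 adds the value from lane t-1)
// i = 2: lanes become { a, a+b, a+b+c, a+b+c+d }  (lanes t >= 2 add the value from lane t-2)
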
//----------------------------------------------------------------------------
/*
* Algorithmic constraints:
* (a) threads with same threadIdx.y have same value
@ -98,7 +246,10 @@ inline void cuda_inter_warp_reduction( ValueType& value,
const int max_active_thread = blockDim.y) {
#define STEP_WIDTH 4
__shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
// Depending on the ValueType, __shared__ memory must be aligned up to 8-byte boundaries.
// The reason not to use ValueType directly is that, for types with constructors,
// it could lead to race conditions.
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
ValueType* result = (ValueType*) & sh_result;
const unsigned step = 32 / blockDim.x;
unsigned shift = STEP_WIDTH;
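
The sizing arithmetic above, worked for a hypothetical 12-byte ValueType: (12 + 7) / 8 = 2 doubles per slot, times STEP_WIDTH = 4 slots, so the buffer is 8 doubles (64 bytes) and is naturally 8-byte aligned because it is declared as an array of double.

// hypothetical instantiation: sizeof(ValueType) == 12, STEP_WIDTH == 4
// (12 + 7) / 8 = 2 doubles per slot  ->  double sh_result[2 * 4]   // 64 bytes, 8-byte aligned
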

View File

@ -91,7 +91,7 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -61,6 +61,8 @@ void set_cuda_task_base_apply_function_pointer
}
template< class > class TaskExec ;
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
{
@ -69,6 +71,7 @@ public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
using member_type = TaskExec< Kokkos::Cuda > ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
@ -79,13 +82,15 @@ public:
static
void execute( queue_type * const );
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
typename TaskType::function_type
get_function_pointer()
{
using TaskType = TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
using function_type = typename TaskType::function_type ;
function_type * const ptr =
(function_type*) cuda_internal_scratch_unified( sizeof(function_type) );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
@ -93,6 +98,8 @@ public:
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
return *ptr ;
}
};
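
A reduced sketch of what get_function_pointer() above does, with hypothetical names (fn_t, device_apply, capture, fetch_device_fn): device function addresses are only meaningful on the device, so a one-thread kernel writes the address into unified memory and the host reads it back after a synchronize.

using fn_t = void (*)( void * );
__device__ void device_apply( void * ) {}              // illustrative target
__global__ void capture( fn_t * out ) { *out = & device_apply; }

fn_t fetch_device_fn( fn_t * unified_scratch ) {
  capture<<< 1, 1 >>>( unified_scratch );              // device writes its own address
  cudaDeviceSynchronize();
  return *unified_scratch;   // a device-only pointer, usable by later kernel launches
}
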
@ -435,18 +442,26 @@ void parallel_reduce
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename ValueType, typename iType, class Lambda >
template< typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda) {
const Closure & closure )
{
// Extract value_type from closure
ValueType accum = 0 ;
ValueType val, y, local_total;
using value_type =
typename Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::SCAN
, void
, Closure >::value_type ;
value_type accum = 0 ;
value_type val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
closure(i,val,false);
// intra-blockDim.y exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
@ -458,7 +473,7 @@ void parallel_scan
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val,
local_total = shfl_warp_broadcast<value_type>(val,
threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
Impl::CudaTraits::WarpSize);
@ -467,7 +482,7 @@ void parallel_scan
if ( threadIdx.y == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
closure(i,val,true);
accum += local_total;
}
}
@ -478,18 +493,26 @@ void parallel_scan
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
template< typename iType, class Closure >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda)
const Closure & closure )
{
ValueType accum = 0 ;
ValueType val, y, local_total;
// Extract value_type from closure
using value_type =
typename Kokkos::Impl::FunctorAnalysis
< Kokkos::Impl::FunctorPatternInterface::SCAN
, void
, Closure >::value_type ;
value_type accum = 0 ;
value_type val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
closure(i,val,false);
// intra-blockDim.x exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
@ -501,14 +524,14 @@ void parallel_scan
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
local_total = shfl_warp_broadcast<value_type>(val, blockDim.x-1, blockDim.x);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, 1, blockDim.x);
if ( threadIdx.x == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
closure(i,val,true);
accum += local_total;
}
}
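
A usage sketch for these scans, assuming Views counts and offsets and a surrounding team (names illustrative); the closure's value_type, here int, is now deduced through FunctorAnalysis rather than supplied as a template parameter.

Kokkos::parallel_scan( Kokkos::TeamThreadRange( team, N ),
  [=] ( const int i, int & partial, const bool final ) {
    if ( final ) offsets(i) = partial;   // write the exclusive prefix sum
    partial += counts(i);
  });
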

View File

@ -44,36 +44,47 @@
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <initializer_list>
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
#define KOKKOS_IMPL_MDRANGE_IVDEP
#include<impl/KokkosExp_Host_IterateTile.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
#endif
namespace Kokkos { namespace Experimental {
// ------------------------------------------------------------------ //
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
Flat, // Do not tile, only valid for inner direction
};
template <typename ExecSpace>
struct default_outer_direction
{
using type = Iterate;
#if defined( KOKKOS_ENABLE_CUDA)
static constexpr Iterate value = Iterate::Left;
#else
static constexpr Iterate value = Iterate::Right;
#endif
};
template <typename ExecSpace>
struct default_inner_direction
{
using type = Iterate;
#if defined( KOKKOS_ENABLE_CUDA)
static constexpr Iterate value = Iterate::Left;
#else
static constexpr Iterate value = Iterate::Right;
#endif
};
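
What the two directions mean for a rank-2 range, sketched as the equivalent loop nests (f is the user functor); per the #if blocks above, both directions default to Left when Cuda is enabled and to Right otherwise.

// Iterate::Right - the rightmost index runs fastest:
//   for (i0 ...) for (i1 ...) f(i0, i1);
// Iterate::Left  - the leftmost index runs fastest:
//   for (i1 ...) for (i0 ...) f(i0, i1);
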
@ -86,7 +97,7 @@ struct Rank
{
static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
static_assert( N < 7u, "Kokkos Error: Unsupported rank...");
using iteration_pattern = Rank<N, OuterDir, InnerDir>;
@ -96,498 +107,236 @@ struct Rank
};
// multi-dimensional iteration pattern
template <typename... Properties>
struct MDRangePolicy
: public Kokkos::Impl::PolicyTraits<Properties ...>
{
using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
using range_policy = RangePolicy<Properties...>;
static_assert( !std::is_same<range_policy,void>::value
using impl_range_policy = RangePolicy< typename traits::execution_space
, typename traits::schedule_type
, typename traits::index_type
> ;
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename range_policy::iteration_pattern;
using work_tag = typename range_policy::work_tag;
using iteration_pattern = typename traits::iteration_pattern;
using work_tag = typename traits::work_tag;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
(iteration_pattern::outer_direction != Iterate::Default)
? iteration_pattern::outer_direction
: default_outer_direction< typename range_policy::execution_space>::value );
: default_outer_direction< typename traits::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename range_policy::execution_space>::value ) ;
: default_inner_direction< typename traits::execution_space>::value ) ;
// Ugly workaround for Intel 14 not handling scoped enums correctly
static constexpr int Flat = static_cast<int>( Iterate::Flat );
static constexpr int Right = static_cast<int>( Iterate::Right );
static constexpr int Left = static_cast<int>( Iterate::Left );
using index_type = typename traits::index_type;
using array_index_type = long;
using point_type = Kokkos::Array<array_index_type,rank>; //was index_type
using tile_type = Kokkos::Array<array_index_type,rank>;
// If point_type or tile_type is not templated on a signed integral type (if it is unsigned),
// then a user who passes an initializer_list of runtime-determined, non-const values of
// signed integral type will receive a compiler error due to an invalid case for
// implicit conversion -
// "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type"
// To avoid this, the user must either pass a matching index_type as a template
// parameter to the MDRangePolicy or static_cast the individual values.
using size_type = typename range_policy::index_type;
using index_type = typename std::make_signed<size_type>::type;
template <typename I>
MDRangePolicy( std::initializer_list<I> upper_corner )
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
: m_lower(lower)
, m_upper(upper)
, m_tile(tile)
, m_num_tiles(1)
{
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
const auto u = upper_corner.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(0);
m_dim[i] = static_cast<index_type>(u[i]);
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
// Host
if ( true
#if defined(KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
#endif
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = span;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
else // Cuda
{
index_type span;
for (int i=0; i<rank; ++i) {
span = upper[i] - lower[i];
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = 16;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
}
}
#endif
}
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
template < typename LT , typename UT , typename TT = array_index_type >
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
{
#if 0
// This should work, less duplicated code but not yet extensively tested
point_type lower_tmp, upper_tmp;
tile_type tile_tmp;
for ( auto i = 0; i < rank; ++i ) {
lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]);
}
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
#else
if(m_lower.size()!=rank || m_upper.size() != rank)
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
for ( auto i = 0; i < rank; ++i ) {
m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
if(tile.size()==rank)
m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
else
m_tile[i] = 0;
}
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB, typename T>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
, std::initializer_list<T> tile
// Host
if ( true
#if defined(KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
#endif
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
const auto t = tile.begin();
m_num_tiles = 1;
index_type span;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = span;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
}
#if defined(KOKKOS_ENABLE_CUDA)
else // Cuda
{
index_type span;
for (int i=0; i<rank; ++i) {
span = m_upper[i] - m_lower[i];
if ( m_tile[i] <= 0 ) {
// TODO: determine what is a good default tile size for cuda
// may be rank dependent
if ( (inner_direction == Right && (i < rank-1))
|| (inner_direction == Left && (i > 0)) )
{
m_tile[i] = 2;
}
else {
m_tile[i] = 16;
}
}
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
m_num_tiles *= m_tile_end[i];
}
index_type total_tile_size_check = 1;
for (int i=0; i<rank; ++i) {
total_tile_size_check *= m_tile[i];
}
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
printf(" Tile dimensions exceed Cuda limits\n");
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
}
}
#endif
#endif
}
index_type m_offset[rank];
index_type m_dim[rank];
int m_tile[rank];
index_type m_tile_dim[rank];
size_type m_num_tiles; // product of tile dims
point_type m_lower;
point_type m_upper;
tile_type m_tile;
point_type m_tile_end;
index_type m_num_tiles;
};
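
A construction sketch for the reworked policy, with hypothetical extents N0 and N1: lower and upper corners plus an optional per-dimension tile; a tile entry of 0 lets the constructor pick the defaults worked out above (span or 2 on host, 16 or 2 on Cuda).

using Kokkos::Experimental::MDRangePolicy;
using Kokkos::Experimental::Rank;

MDRangePolicy< Rank<2> > policy( {0, 0}, {N0, N1}, {16, 16} );   // explicit 16x16 tiles
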
// ------------------------------------------------------------------ //
namespace Impl {
// Serial, Threads, OpenMP
// use enable_if to overload for Cuda
template < typename MDRange, typename Functor, typename Enable = void >
struct MDForFunctor
{
using work_tag = typename MDRange::work_tag;
using index_type = typename MDRange::index_type;
using size_type = typename MDRange::size_type;
MDRange m_range;
Functor m_func;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor const& f )
: m_range(range)
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor && f )
: m_range(range)
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor const& f )
: m_range( std::forward<MDRange>(range) )
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor && f )
: m_range( std::forward<MDRange>(range) )
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor && ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor && ) = default;
// Rank-2, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1 );
}}
}
}
// Rank-2, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( tag, i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1 );
}}
}
}
//---------------------------------------------------------------------------
// Rank-3, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( work_tag{}
, m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( work_tag{}
, m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1, i2 );
}}}
}
}
// Rank-3, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( tag, i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1, i2 );
}}}
}
}
};
} // namespace Impl
template <typename MDRange, typename Functor>
// ------------------------------------------------------------------ //
//md_parallel_for
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename Enable = void>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::range_policy;
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
@ -596,15 +345,132 @@ template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
using range_policy = typename MDRange::range_policy;
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
// Cuda specialization
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
template <typename MDRange, typename Functor>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
closure.execute();
}
#endif
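
An illustrative call, assuming the rank-2 policy sketched earlier and a 2-D View A; on Cuda this dispatches the DeviceIterateTile closure above, on host it runs a flat RangePolicy over tiles.

Kokkos::Experimental::md_parallel_for( policy,
  KOKKOS_LAMBDA ( const int i, const int j ) { A(i,j) = i + j; } );
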
// ------------------------------------------------------------------ //
// ------------------------------------------------------------------ //
//md_parallel_reduce
// ------------------------------------------------------------------ //
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
, Functor const& f
, ValueType & v
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( const std::string& str
, MDRange const& range
, Functor const& f
, ValueType & v
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
//using range_policy = typename MDRange::range_policy;
using range_policy = typename MDRange::impl_range_policy;
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
}
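
And the reduction counterpart under the same assumptions; the host path wraps the functor and the running value in Impl::MDFunctor and dispatches a flat parallel_reduce over range.m_num_tiles.

double sum = 0;
Kokkos::Experimental::md_parallel_reduce( policy,
  KOKKOS_LAMBDA ( const int i, const int j, double & val ) { val += A(i,j); },
  sum );
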
// Cuda - parallel_reduce not implemented yet
/*
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( MDRange const& range
, Functor const& f
, ValueType & v
, const std::string& str = ""
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
template <typename MDRange, typename Functor, typename ValueType>
void md_parallel_reduce( const std::string& str
, MDRange const& range
, Functor const& f
, ValueType & v
, typename std::enable_if<( true
#if defined( KOKKOS_ENABLE_CUDA)
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
#endif
) >::type* = 0
)
{
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
closure.execute();
}
*/
}} // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP

View File

@ -59,8 +59,14 @@ template< class T = void
, class Proxy = void
>
struct Array {
private:
T m_elem[N];
public:
/**
* The elements of this C array shall not be accessed directly. The data
* member has to be declared public to enable aggregate initialization as for
* std::array. We mark it as private in the documentation.
* @private
*/
T m_internal_implementation_private_member_data[N];
public:
typedef T & reference ;
@ -78,25 +84,32 @@ public:
KOKKOS_INLINE_FUNCTION
reference operator[]( const iType & i )
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
return m_elem[i];
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_internal_implementation_private_member_data[i];
}
template< typename iType >
KOKKOS_INLINE_FUNCTION
const_reference operator[]( const iType & i ) const
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
return m_elem[i];
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_internal_implementation_private_member_data[i];
}
KOKKOS_INLINE_FUNCTION pointer data() { return & m_elem[0] ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return & m_elem[0] ; }
KOKKOS_INLINE_FUNCTION pointer data()
{
return & m_internal_implementation_private_member_data[0];
}
KOKKOS_INLINE_FUNCTION const_pointer data() const
{
return & m_internal_implementation_private_member_data[0];
}
~Array() = default ;
Array() = default ;
Array( const Array & ) = default ;
Array & operator = ( const Array & ) = default ;
// Do not default unless move and move-assignment are also defined
// ~Array() = default ;
// Array() = default ;
// Array( const Array & ) = default ;
// Array & operator = ( const Array & ) = default ;
// Some supported compilers are not sufficiently C++11 compliant
// for default move constructor and move assignment operator.
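
The renamed-but-still-public storage member keeps Kokkos::Array an aggregate, so brace initialization keeps working exactly as for std::array (values illustrative):

Kokkos::Array<int, 3> a = { 1, 2, 3 };   // aggregate initialization
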
@ -124,7 +137,7 @@ public:
KOKKOS_INLINE_FUNCTION
value_type operator[]( const iType & )
{
static_assert( std::is_integral<iType>::value , "Must be integer argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
return value_type();
}
@ -132,7 +145,7 @@ public:
KOKKOS_INLINE_FUNCTION
value_type operator[]( const iType & ) const
{
static_assert( std::is_integral<iType>::value , "Must be integer argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
return value_type();
}
@ -181,7 +194,7 @@ public:
KOKKOS_INLINE_FUNCTION
reference operator[]( const iType & i )
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i];
}
@ -189,7 +202,7 @@ public:
KOKKOS_INLINE_FUNCTION
const_reference operator[]( const iType & i ) const
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i];
}
@ -250,7 +263,7 @@ public:
KOKKOS_INLINE_FUNCTION
reference operator[]( const iType & i )
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i*m_stride];
}
@ -258,7 +271,7 @@ public:
KOKKOS_INLINE_FUNCTION
const_reference operator[]( const iType & i ) const
{
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
return m_elem[i*m_stride];
}

View File

@ -102,6 +102,7 @@ KOKKOS_IMPL_IS_CONCEPT( memory_traits )
KOKKOS_IMPL_IS_CONCEPT( execution_space )
KOKKOS_IMPL_IS_CONCEPT( execution_policy )
KOKKOS_IMPL_IS_CONCEPT( array_layout )
KOKKOS_IMPL_IS_CONCEPT( reducer )
namespace Impl {

View File

@ -57,6 +57,10 @@
#include <Kokkos_OpenMP.hpp>
#endif
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <Kokkos_Qthreads.hpp>
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
#include <Kokkos_Threads.hpp>
#endif
@ -76,6 +80,7 @@
#include <Kokkos_Complex.hpp>
#include <iosfwd>
//----------------------------------------------------------------------------
@ -105,6 +110,9 @@ void finalize_all();
void fence();
/** \brief Print "Bill of Materials" */
void print_configuration( std::ostream & , const bool detail = false );
} // namespace Kokkos
//----------------------------------------------------------------------------
@ -159,4 +167,3 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
//----------------------------------------------------------------------------
#endif

View File

@ -63,7 +63,7 @@ namespace Kokkos {
struct AUTO_t {
KOKKOS_INLINE_FUNCTION
constexpr const AUTO_t & operator()() const { return *this ; }
constexpr const AUTO_t & operator()() const { return *this; }
};
namespace {
@ -73,46 +73,49 @@ constexpr AUTO_t AUTO = Kokkos::AUTO_t();
struct InvalidType {};
}
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Forward declarations for class inter-relationships
namespace Kokkos {
class HostSpace ; ///< Memory space for main process and CPU execution spaces
class HostSpace; ///< Memory space for main process and CPU execution spaces
#ifdef KOKKOS_ENABLE_HBWSPACE
namespace Experimental {
class HBWSpace ; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
}
#endif
#if defined( KOKKOS_ENABLE_SERIAL )
class Serial ; ///< Execution space main process on CPU
#endif // defined( KOKKOS_ENABLE_SERIAL )
class Serial; ///< Execution space main process on CPU.
#endif
#if defined( KOKKOS_ENABLE_QTHREADS )
class Qthreads; ///< Execution space with Qthreads back-end.
#endif
#if defined( KOKKOS_ENABLE_PTHREAD )
class Threads ; ///< Execution space with pthreads back-end
class Threads; ///< Execution space with pthreads back-end.
#endif
#if defined( KOKKOS_ENABLE_OPENMP )
class OpenMP ; ///< OpenMP execution space
class OpenMP; ///< OpenMP execution space.
#endif
#if defined( KOKKOS_ENABLE_CUDA )
class CudaSpace ; ///< Memory space on Cuda GPU
class CudaUVMSpace ; ///< Memory space on Cuda GPU with UVM
class CudaHostPinnedSpace ; ///< Memory space on Host accessible to Cuda GPU
class Cuda ; ///< Execution space for Cuda GPU
class CudaSpace; ///< Memory space on Cuda GPU
class CudaUVMSpace; ///< Memory space on Cuda GPU with UVM
class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU
class Cuda; ///< Execution space for Cuda GPU
#endif
template<class ExecutionSpace, class MemorySpace>
struct Device;
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Set the default execution space.
@ -122,60 +125,66 @@ struct Device;
namespace Kokkos {
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Cuda DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultExecutionSpace ;
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
typedef Cuda DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultExecutionSpace;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Qthreads DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultExecutionSpace;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
#endif
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_PTHREAD )
typedef Threads DefaultHostExecutionSpace ;
#elif defined ( KOKKOS_ENABLE_SERIAL )
typedef Serial DefaultHostExecutionSpace ;
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Threads DefaultHostExecutionSpace;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Qthreads DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
typedef Serial DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef OpenMP DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_PTHREAD )
typedef Threads DefaultHostExecutionSpace;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Qthreads DefaultHostExecutionSpace;
#elif defined( KOKKOS_ENABLE_SERIAL )
typedef Serial DefaultHostExecutionSpace;
#else
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
#endif
} // namespace Kokkos
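The selected default is what unqualified dispatches use; a small sketch of that guarantee (assuming `<type_traits>`):

```cpp
#include <Kokkos_Core.hpp>
#include <type_traits>

// An unqualified RangePolicy executes on DefaultExecutionSpace.
static_assert(
  std::is_same< Kokkos::RangePolicy<>::execution_space,
                Kokkos::DefaultExecutionSpace >::value,
  "unqualified policies dispatch to the default execution space" );
```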
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
// Detect the active execution space and define its memory space.
// This is used to verify whether a running kernel can access
// a given memory space.
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_ENABLE_CUDA)
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
typedef Kokkos::HostSpace ActiveExecutionMemorySpace;
#else
typedef void ActiveExecutionMemorySpace ;
typedef void ActiveExecutionMemorySpace;
#endif
template< class ActiveSpace , class MemorySpace >
template< class ActiveSpace, class MemorySpace >
struct VerifyExecutionCanAccessMemorySpace {
enum {value = 0};
};
template< class Space >
struct VerifyExecutionCanAccessMemorySpace< Space , Space >
struct VerifyExecutionCanAccessMemorySpace< Space, Space >
{
enum {value = 1};
KOKKOS_INLINE_FUNCTION static void verify(void) {}
@ -183,27 +192,27 @@ struct VerifyExecutionCanAccessMemorySpace< Space , Space >
};
} // namespace Impl
} // namespace Kokkos
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE, DATA_PTR ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify( DATA_PTR )
#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify()
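A minimal sketch of how these verification macros are meant to be used inside a kernel (the function and pointer are hypothetical):

```cpp
// Trap at run time if the active execution space cannot access
// data that lives in Kokkos::HostSpace.
KOKKOS_INLINE_FUNCTION
void touch( const double * ptr )
{
  KOKKOS_RESTRICT_EXECUTION_TO_DATA( Kokkos::HostSpace, ptr );
  // ... ptr may be dereferenced safely here ...
}
```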
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
void fence();
}
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class Functor
@ -220,18 +229,18 @@ struct FunctorPolicyExecutionSpace;
///
/// This is an implementation detail of parallel_for. Users should
/// skip this and go directly to the nonmember function parallel_for.
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelFor ;
template< class FunctorType, class ExecPolicy, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
> class ParallelFor;
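Since the comment points users at the nonmember function, a minimal `parallel_for` sketch (the View and scale factor are illustrative):

```cpp
#include <Kokkos_Core.hpp>

void scale( Kokkos::View< double* > x, const int N, const double a )
{
  // Execute the lambda for i = 0..N-1 on the default execution space.
  Kokkos::parallel_for( N, KOKKOS_LAMBDA( const int i ) {
    x( i ) *= a;
  } );
}
```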
/// \class ParallelReduce
/// \brief Implementation detail of parallel_reduce.
///
/// This is an implementation detail of parallel_reduce. Users should
/// skip this and go directly to the nonmember function parallel_reduce.
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelReduce ;
template< class FunctorType, class ExecPolicy, class ReducerType = InvalidType, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
> class ParallelReduce;
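Correspondingly, a minimal `parallel_reduce` sketch (sum of a View's entries; names illustrative):

```cpp
#include <Kokkos_Core.hpp>

double sum( Kokkos::View< const double* > x, const int N )
{
  double result = 0.0;
  // Each iteration adds into a thread-private partial value;
  // Kokkos combines the partials into 'result'.
  Kokkos::parallel_reduce( N, KOKKOS_LAMBDA( const int i, double & partial ) {
    partial += x( i );
  }, result );
  return result;
}
```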
/// \class ParallelScan
/// \brief Implementation detail of parallel_scan.
@ -239,10 +248,12 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType
/// This is an implementation detail of parallel_scan. Users should
/// skip this and go directly to the documentation of the nonmember
/// template function Kokkos::parallel_scan.
template< class FunctorType , class ExecPolicy , class ExecutionSapce =
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
> class ParallelScan ;
template< class FunctorType, class ExecPolicy, class ExecutionSpace =
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
> class ParallelScan;
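And a minimal `parallel_scan` sketch, an exclusive prefix sum (the two-pass protocol signalled by the `final` flag is the part worth seeing):

```cpp
#include <Kokkos_Core.hpp>

void exclusive_prefix_sum( Kokkos::View< int* > x, const int N )
{
  // The functor runs in two passes; 'final' marks the pass in which
  // the running total is valid and may be written back.
  Kokkos::parallel_scan( N, KOKKOS_LAMBDA( const int i, int & update, const bool final ) {
    const int value = x( i );
    if ( final ) x( i ) = update; // exclusive: write the prefix before adding
    update += value;
  } );
}
```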
} // namespace Impl
} // namespace Kokkos
}}
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */

View File

@ -62,7 +62,6 @@
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
@ -295,6 +294,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
#include <Cuda/Kokkos_Cuda_Task.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */

View File

@ -44,14 +44,16 @@
#ifndef KOKKOS_HBWSPACE_HPP
#define KOKKOS_HBWSPACE_HPP
#include <Kokkos_HostSpace.hpp>
/*--------------------------------------------------------------------------*/
#ifdef KOKKOS_ENABLE_HBWSPACE
namespace Kokkos {
namespace Experimental {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
@ -67,7 +69,7 @@ void init_lock_array_hbw_space();
/// This function tries to acquire the lock for the hash value derived
/// from the provided ptr. If the lock is successfully acquired the
/// function returns true. Otherwise it returns false.
bool lock_address_hbw_space(void* ptr);
bool lock_address_hbw_space( void* ptr );
/// \brief Release lock for the address
///
@ -75,13 +77,16 @@ bool lock_address_hbw_space(void* ptr);
/// from the provided ptr. This function should only be called
/// after previously successfully acquiring a lock with
/// lock_address.
void unlock_address_hbw_space(void* ptr);
void unlock_address_hbw_space( void* ptr );
} // namespace Impl
} // neamspace Experimental
} // namespace Experimental
} // namespace Kokkos
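A hedged sketch of the lock/unlock pairing these declarations imply (the spin loop and caller are illustrative, not part of the header):

```cpp
// Hypothetical caller: spin until the per-hash lock for ptr is held,
// perform the non-atomic update, then release.
void locked_update( void * ptr )
{
  while ( ! Kokkos::Experimental::Impl::lock_address_hbw_space( ptr ) ) { /* retry */ }
  // ... arbitrarily sized read-modify-write on *ptr ...
  Kokkos::Experimental::Impl::unlock_address_hbw_space( ptr );
}
```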
namespace Kokkos {
namespace Experimental {
/// \class HBWSpace
@ -91,10 +96,9 @@ namespace Experimental {
/// memory means the usual CPU-accessible memory.
class HBWSpace {
public:
//! Tag this class as a Kokkos memory space
typedef HBWSpace memory_space ;
typedef size_t size_type ;
typedef HBWSpace memory_space;
typedef size_t size_type;
/// \typedef execution_space
/// \brief Default execution space for this memory space.
@ -103,21 +107,25 @@ public:
/// useful for things like initializing a View (which happens in
/// parallel using the View's default execution space).
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_PTHREAD )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_SERIAL )
typedef Kokkos::Serial execution_space ;
typedef Kokkos::Serial execution_space;
#else
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
#endif
//! This memory space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HBWSpace */
@ -129,67 +137,68 @@ public:
/**\brief Default memory space instance */
HBWSpace();
HBWSpace( const HBWSpace & rhs ) = default ;
HBWSpace & operator = ( const HBWSpace & ) = default ;
~HBWSpace() = default ;
HBWSpace( const HBWSpace & rhs ) = default;
HBWSpace & operator = ( const HBWSpace & ) = default;
~HBWSpace() = default;
/**\brief Non-default memory space instance to choose allocation mechanism, if available */
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
explicit
HBWSpace( const AllocationMechanism & );
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
void * allocate( const size_t arg_alloc_size ) const;
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
private:
AllocationMechanism m_alloc_mech ;
AllocationMechanism m_alloc_mech;
static constexpr const char* m_name = "HBW";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
};
} // namespace Experimental
} // namespace Kokkos
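The untracked allocate/deallocate pair declared above is used as follows; a minimal sketch, assuming KOKKOS_ENABLE_HBWSPACE is defined:

```cpp
#include <Kokkos_HBWSpace.hpp>

void raw_hbw_buffer()
{
  // Untracked allocation in high-bandwidth memory: the caller owns the
  // pointer and must pass the same size back to deallocate.
  Kokkos::Experimental::HBWSpace space;
  void * p = space.allocate( 1024 );
  // ... use p ...
  space.deallocate( p, 1024 );
}
```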
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
: public SharedAllocationRecord< void , void >
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >
: public SharedAllocationRecord< void, void >
{
private:
friend Kokkos::Experimental::HBWSpace ;
friend Kokkos::Experimental::HBWSpace;
typedef SharedAllocationRecord< void , void > RecordBase ;
typedef SharedAllocationRecord< void, void > RecordBase;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
static void deallocate( RecordBase * );
/**\brief Root record for tracked allocations from this HBWSpace instance */
static RecordBase s_root_record ;
static RecordBase s_root_record;
const Kokkos::Experimental::HBWSpace m_space ;
const Kokkos::Experimental::HBWSpace m_space;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() = default ;
SharedAllocationRecord() = default;
SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
, const std::string & arg_label
@ -212,9 +221,9 @@ public:
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
#else
return (SharedAllocationRecord *) 0 ;
return (SharedAllocationRecord *) 0;
#endif
}
@ -233,88 +242,93 @@ public:
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
static void print_records( std::ostream &, const Kokkos::Experimental::HBWSpace &, bool detail = false );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" );
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace >::assignable, "" );
template<>
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > {
struct MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace > {
enum { assignable = true };
enum { accessible = true };
enum { deepcopy = true };
};
template<>
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> {
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace > {
enum { assignable = false };
enum { accessible = true };
enum { deepcopy = true };
};
}}
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<class ExecutionSpace>
struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< Experimental::HBWSpace, Experimental::HBWSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
template<class ExecutionSpace>
struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< HostSpace, Experimental::HBWSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
template<class ExecutionSpace>
struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< Experimental::HBWSpace, HostSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
} // namespace Impl
} // namespace Kokkos
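These specializations are what a user-level `Kokkos::deep_copy` between host and HBW views ultimately dispatches to; a sketch (View shapes illustrative):

```cpp
#include <Kokkos_Core.hpp>

void copy_in( Kokkos::View< double*, Kokkos::Experimental::HBWSpace > dst,
              Kokkos::View< double*, Kokkos::HostSpace > src )
{
  // Lands on DeepCopy< HBWSpace, HostSpace, ... >, i.e. a fenced memcpy.
  Kokkos::deep_copy( dst, src );
}
```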
namespace Kokkos {
namespace Impl {
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace >
{
enum { value = true };
inline static void verify( void ) { }
@ -322,7 +336,7 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experime
};
template<>
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace >
{
enum { value = true };
inline static void verify( void ) { }
@ -330,8 +344,9 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kok
};
} // namespace Impl
} // namespace Kokkos
#endif
#endif /* #define KOKKOS_HBWSPACE_HPP */
#endif // #ifndef KOKKOS_HBWSPACE_HPP

View File

@ -60,6 +60,7 @@
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
/// \brief Initialize lock array for arbitrary size atomics.
@ -83,9 +84,10 @@ bool lock_address_host_space(void* ptr);
/// from the provided ptr. This function should only be called
/// after previously successfully aquiring a lock with
/// lock_address.
void unlock_address_host_space(void* ptr);
void unlock_address_host_space( void* ptr );
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
@ -97,10 +99,9 @@ namespace Kokkos {
/// memory means the usual CPU-accessible memory.
class HostSpace {
public:
//! Tag this class as a Kokkos memory space
typedef HostSpace memory_space ;
typedef size_t size_type ;
typedef HostSpace memory_space;
typedef size_t size_type;
/// \typedef execution_space
/// \brief Default execution space for this memory space.
@ -109,21 +110,25 @@ public:
/// useful for things like initializing a View (which happens in
/// parallel using the View's default execution space).
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_OPENMP )
typedef Kokkos::OpenMP execution_space ;
typedef Kokkos::OpenMP execution_space;
#elif defined( KOKKOS_ENABLE_PTHREAD )
typedef Kokkos::Threads execution_space ;
typedef Kokkos::Threads execution_space;
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// typedef Kokkos::Qthreads execution_space;
#elif defined( KOKKOS_ENABLE_SERIAL )
typedef Kokkos::Serial execution_space ;
typedef Kokkos::Serial execution_space;
#else
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
#endif
//! This memory space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::Device< execution_space, memory_space > device_type;
/*--------------------------------*/
/* Functions unique to the HostSpace */
@ -135,61 +140,57 @@ public:
/**\brief Default memory space instance */
HostSpace();
HostSpace( HostSpace && rhs ) = default ;
HostSpace( const HostSpace & rhs ) = default ;
HostSpace & operator = ( HostSpace && ) = default ;
HostSpace & operator = ( const HostSpace & ) = default ;
~HostSpace() = default ;
HostSpace( HostSpace && rhs ) = default;
HostSpace( const HostSpace & rhs ) = default;
HostSpace & operator = ( HostSpace && ) = default;
HostSpace & operator = ( const HostSpace & ) = default;
~HostSpace() = default;
/**\brief Non-default memory space instance to choose allocation mechanism, if available */
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
explicit
HostSpace( const AllocationMechanism & );
/**\brief Allocate untracked memory in the space */
void * allocate( const size_t arg_alloc_size ) const ;
void * allocate( const size_t arg_alloc_size ) const;
/**\brief Deallocate untracked memory in the space */
void deallocate( void * const arg_alloc_ptr
, const size_t arg_alloc_size ) const ;
, const size_t arg_alloc_size ) const;
/**\brief Return Name of the MemorySpace */
static constexpr const char* name();
private:
AllocationMechanism m_alloc_mech ;
AllocationMechanism m_alloc_mech;
static constexpr const char* m_name = "Host";
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace, void >;
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" );
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
template< typename S >
struct HostMirror {
private:
// If input execution space can access HostSpace then keep it.
// Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot
enum { keep_exe = Kokkos::Impl::MemorySpaceAccess
< typename S::execution_space::memory_space , Kokkos::HostSpace >
::accessible };
< typename S::execution_space::memory_space, Kokkos::HostSpace >::accessible };
// If HostSpace can access memory space then keep it.
// Example: Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace
enum { keep_mem = Kokkos::Impl::MemorySpaceAccess
< Kokkos::HostSpace , typename S::memory_space >::accessible };
< Kokkos::HostSpace, typename S::memory_space >::accessible };
public:
@ -202,42 +203,41 @@ public:
, typename S::memory_space >
, Kokkos::HostSpace
>::type
>::type Space ;
>::type Space;
};
} // namespace Impl
} // namespace Kokkos
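`HostMirror` is the trait behind `create_mirror_view`; a sketch of the behavior the two comments describe, assuming CUDA is enabled for the sake of the example:

```cpp
#include <Kokkos_Core.hpp>

void stage( Kokkos::View< double*, Kokkos::CudaSpace > d )
{
  // keep_mem is false for CudaSpace, so the mirror lands in HostSpace.
  auto h = Kokkos::create_mirror_view( d );
  // ... fill h on the host ...
  Kokkos::deep_copy( d, h );
}
```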
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class SharedAllocationRecord< Kokkos::HostSpace , void >
: public SharedAllocationRecord< void , void >
class SharedAllocationRecord< Kokkos::HostSpace, void >
: public SharedAllocationRecord< void, void >
{
private:
friend Kokkos::HostSpace;
friend Kokkos::HostSpace ;
typedef SharedAllocationRecord< void, void > RecordBase;
typedef SharedAllocationRecord< void , void > RecordBase ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
static void deallocate( RecordBase * );
/**\brief Root record for tracked allocations from this HostSpace instance */
static RecordBase s_root_record ;
static RecordBase s_root_record;
const Kokkos::HostSpace m_space ;
const Kokkos::HostSpace m_space;
protected:
~SharedAllocationRecord();
SharedAllocationRecord() = default ;
SharedAllocationRecord() = default;
SharedAllocationRecord( const Kokkos::HostSpace & arg_space
, const std::string & arg_label
@ -260,12 +260,13 @@ public:
)
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
#else
return (SharedAllocationRecord *) 0 ;
return (SharedAllocationRecord *) 0;
#endif
}
/**\brief Allocate tracked memory in the space */
static
void * allocate_tracked( const Kokkos::HostSpace & arg_space
@ -281,37 +282,37 @@ public:
static
void deallocate_tracked( void * const arg_alloc_ptr );
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
static void print_records( std::ostream &, const Kokkos::HostSpace &, bool detail = false );
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space> struct DeepCopy ;
template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space > struct DeepCopy;
template<class ExecutionSpace>
struct DeepCopy<HostSpace,HostSpace,ExecutionSpace> {
DeepCopy( void * dst , const void * src , size_t n ) {
memcpy( dst , src , n );
template< class ExecutionSpace >
struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
DeepCopy( void * dst, const void * src, size_t n ) {
memcpy( dst, src, n );
}
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
exec.fence();
memcpy( dst , src , n );
memcpy( dst, src, n );
}
};
} // namespace Impl
} // namespace Kokkos
#endif /* #define KOKKOS_HOSTSPACE_HPP */
#endif // #ifndef KOKKOS_HOSTSPACE_HPP

View File

@ -45,22 +45,20 @@
#define KOKKOS_MACROS_HPP
//----------------------------------------------------------------------------
/** Pick up configure/build options via #define macros:
/** Pick up configure / build options via #define macros:
*
* KOKKOS_ENABLE_CUDA Kokkos::Cuda execution and memory spaces
* KOKKOS_ENABLE_PTHREAD Kokkos::Threads execution space
* KOKKOS_ENABLE_QTHREAD Kokkos::Qthread execution space
* KOKKOS_ENABLE_QTHREADS Kokkos::Qthreads execution space
* KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space
* KOKKOS_ENABLE_HWLOC HWLOC library is available
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK insert array bounds checks, is expensive!
*
* KOKKOS_ENABLE_MPI negotiate MPI/execution space interactions
*
* KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space
* KOKKOS_ENABLE_HWLOC HWLOC library is available.
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive!
* KOKKOS_ENABLE_MPI Negotiate MPI/execution space interactions.
* KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space.
*/
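These option macros are meant to be tested for definedness, never for a 0/1 value; a minimal sketch of the intended usage:

```cpp
// Correct: test whether the backend macro is defined at all.
#if defined( KOKKOS_ENABLE_CUDA )
  // CUDA-specific code path
#endif

// Incorrect under this convention: treating the macro as a value.
//#if KOKKOS_ENABLE_CUDA
//#endif
```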
#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
#include <KokkosCore_config.h>
#include <KokkosCore_config.h>
#endif
#include <impl/Kokkos_OldMacros.hpp>
@ -86,7 +84,7 @@
* KOKKOS_ENABLE_INTEL_ATOMICS
* KOKKOS_ENABLE_OPENMP_ATOMICS
*
* A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
* A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use.
*
* Macros for marking functions to run in an execution space:
*
@ -98,64 +96,63 @@
//----------------------------------------------------------------------------
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Compiling with a CUDA compiler.
//
// Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
// CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
//
// When generating device code the __CUDA_ARCH__ macro is defined as:
// __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
/* Compiling with a CUDA compiler.
*
* Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
* CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
*
* When generating device code the __CUDA_ARCH__ macro is defined as:
* __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
*/
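Worked instances of the two encodings (the values follow directly from the formulas above):

```cpp
// CUDA_VERSION:  7.5 -> 7*1000 + 5*10 = 7050;  8.0 -> 8000.
// __CUDA_ARCH__: capability 3.5 -> 3*100 + 5*10 = 350;  6.1 -> 610.
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ >= 350 )
  // device code compiled for Kepler 3.5 or newer
#endif
```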
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda.h>
#if !defined( CUDA_VERSION )
#error "#include <cuda.h> did not define CUDA_VERSION."
#endif
#if ! defined( CUDA_VERSION )
#error "#include <cuda.h> did not define CUDA_VERSION"
#endif
#if ( CUDA_VERSION < 7000 )
// CUDA supports C++11 in device code starting with version 7.0.
// This includes auto type and device code internal lambdas.
#error "Cuda version 7.0 or greater required."
#endif
#if ( CUDA_VERSION < 7000 )
// CUDA supports C++11 in device code starting with
// version 7.0. This includes auto type and device code internal
// lambdas.
#error "Cuda version 7.0 or greater required"
#endif
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
// Compiling with CUDA compiler for device code.
#error "Cuda device capability >= 3.0 is required."
#endif
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
/* Compiling with CUDA compiler for device code. */
#error "Cuda device capability >= 3.0 is required"
#endif
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
#if ( CUDA_VERSION < 7050 )
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
#if ( CUDA_VERSION < 7050 )
// CUDA supports C++11 lambdas generated in host code to be given
// to the device starting with version 7.5. But the release candidate (7.5.6)
// still identifies as 7.0
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
#endif
#if ( CUDA_VERSION < 8000 ) && defined(__NVCC__)
// still identifies as 7.0.
#error "Cuda version 7.5 or greater required for host-to-device Lambda support."
#endif
#if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
#define KOKKOS_LAMBDA [=]__device__
#else
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#if defined( KOKKOS_ENABLE_CXX1Z )
#define KOKKOS_CLASS_LAMBDA [=,*this] __host__ __device__
#endif
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ ) */
#endif
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
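In practice the macro lets one lambda source serve both builds; a short sketch (bounds and body illustrative):

```cpp
void dispatch()
{
  // With nvcc >= 8.0 KOKKOS_LAMBDA expands to [=] __host__ __device__;
  // in a host-only build it is plain [=], so the same source compiles both ways.
  Kokkos::parallel_for( 100, KOKKOS_LAMBDA( const int i ) { /* ... */ } );
}
```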
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
// Cuda version 8.0 still needs the functor wrapper
#if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) && defined(__NVCC__)
#if /* ( CUDA_VERSION < 8000 ) && */ defined( __NVCC__ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
/*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */
//----------------------------------------------------------------------------
// Language info: C++, CUDA, OPENMP
#if defined( KOKKOS_ENABLE_CUDA )
// Compiling Cuda code to 'ptx'
@ -163,20 +160,17 @@
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
#endif /* #if defined( __CUDA_ARCH__ ) */
#endif // #if defined( __CUDA_ARCH__ )
#if defined( _OPENMP )
// Compiling with OpenMP.
// The value of _OPENMP is an integer value YYYYMM
// where YYYY and MM are the year and month designation
// of the supported OpenMP API version.
#endif // #if defined( _OPENMP )
/* Compiling with OpenMP.
* The value of _OPENMP is an integer value YYYYMM
* where YYYY and MM are the year and month designation
* of the supported OpenMP API version.
*/
#endif /* #if defined( _OPENMP ) */
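A worked instance of the YYYYMM encoding (OpenMP 3.1 -> 201107, 4.0 -> 201307, 4.5 -> 201511):

```cpp
#if defined( _OPENMP ) && ( _OPENMP >= 201307 )
  // May rely on OpenMP 4.0 features here (e.g. #pragma omp simd).
#endif
```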
/*--------------------------------------------------------------------------*/
/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
//----------------------------------------------------------------------------
// Mapping compiler built-ins to KOKKOS_COMPILER_*** macros
#if defined( __NVCC__ )
// NVIDIA compiler is being used.
@ -184,29 +178,28 @@
// Host code is compiled again with another compiler.
// Device code is compile to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
#else
#if ! defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined (KOKKOS_ENABLE_CUDA) // Compiling with clang for Cuda does not work with LAMBDAs either
#if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
#if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
// CUDA (including version 6.5) does not support giving lambdas as
// arguments to global functions. Thus it's not currently possible
// to dispatch lambdas from the host.
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
#endif
#endif
#endif /* #if defined( __NVCC__ ) */
#endif // #if defined( __NVCC__ )
#if !defined (KOKKOS_LAMBDA)
#if !defined( KOKKOS_LAMBDA )
#define KOKKOS_LAMBDA [=]
#endif
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined (KOKKOS_CLASS_LAMBDA)
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
#define KOKKOS_CLASS_LAMBDA [=,*this]
#endif
//#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'.
/* Intel compiler for host code */
// Intel compiler for host code.
#if defined( __INTEL_COMPILER )
#define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
@ -218,7 +211,7 @@
#define KOKKOS_COMPILER_INTEL __ECC
#endif
/* CRAY compiler for host code */
// CRAY compiler for host code
#if defined( _CRAYC )
#define KOKKOS_COMPILER_CRAYC _CRAYC
#endif
@ -234,38 +227,41 @@
#define KOKKOS_COMPILER_APPLECC __APPLE_CC__
#endif
#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
#if defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL )
#define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
#endif
#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
#if !defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL ) && defined( __GNUC__ )
#define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
#if ( 472 > KOKKOS_COMPILER_GNU )
#error "Compiling with GCC version earlier than 4.7.2 is not supported."
#endif
#endif
#if defined( __PGIC__ ) && ! defined( __GNUC__ )
#if defined( __PGIC__ ) && !defined( __GNUC__ )
#define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
#if ( 1540 > KOKKOS_COMPILER_PGI )
#error "Compiling with PGI version earlier than 15.4 is not supported."
#endif
#endif
//#endif /* #if ! defined( __CUDA_ARCH__ ) */
//#endif // #if !defined( __CUDA_ARCH__ )
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Intel compiler macros */
//----------------------------------------------------------------------------
// Intel compiler macros
#if defined( KOKKOS_COMPILER_INTEL )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ( __INTEL_COMPILER > 1400 )
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
#endif
#define KOKKOS_RESTRICT __restrict__
#ifndef KOKKOS_ALIGN
@ -287,12 +283,13 @@
#warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0."
#endif
#endif
#if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( _WIN32 )
#define KOKKOS_ENABLE_ASM 1
#endif
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined (_WIN32)
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined( _WIN32 )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#else
#define KOKKOS_FORCEINLINE_FUNCTION inline
@ -302,192 +299,170 @@
#if defined( __MIC__ )
// Compiling for Xeon Phi
#endif
#endif
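These KOKKOS_ENABLE_PRAGMA_* switches are how performance-sensitive loops guard vendor pragmas; a hedged sketch (the axpy loop is illustrative):

```cpp
void axpy( double * KOKKOS_RESTRICT y, const double * KOKKOS_RESTRICT x,
           const double a, const int n )
{
#if defined( KOKKOS_ENABLE_PRAGMA_IVDEP )
  #pragma ivdep
#endif
  for ( int i = 0; i < n; ++i ) y[ i ] += a * x[ i ];
}
```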
/*--------------------------------------------------------------------------*/
/* Cray compiler macros */
//----------------------------------------------------------------------------
// Cray compiler macros
#if defined( KOKKOS_COMPILER_CRAYC )
#endif
/*--------------------------------------------------------------------------*/
/* IBM Compiler macros */
//----------------------------------------------------------------------------
// IBM Compiler macros
#if defined( KOKKOS_COMPILER_IBM )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
/* CLANG compiler macros */
//----------------------------------------------------------------------------
// CLANG compiler macros
#if defined( KOKKOS_COMPILER_CLANG )
//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#endif
/*--------------------------------------------------------------------------*/
/* GNU Compiler macros */
//----------------------------------------------------------------------------
// GNU Compiler macros
#if defined( KOKKOS_COMPILER_GNU )
//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
#endif
#if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( __PGIC__ ) && \
( defined( __amd64 ) || \
defined( __amd64__ ) || \
defined( __x86_64 ) || \
defined( __x86_64__ ) )
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
( defined( __amd64 ) || defined( __amd64__ ) || \
defined( __x86_64 ) || defined( __x86_64__ ) )
#define KOKKOS_ENABLE_ASM 1
#endif
#endif
/*--------------------------------------------------------------------------*/
//----------------------------------------------------------------------------
#if defined( KOKKOS_COMPILER_PGI )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
#endif
/*--------------------------------------------------------------------------*/
//----------------------------------------------------------------------------
#if defined( KOKKOS_COMPILER_NVCC )
#if defined(__CUDA_ARCH__ )
#if defined( __CUDA_ARCH__ )
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
#endif
#endif
//----------------------------------------------------------------------------
/** Define function marking macros if compiler specific macros are undefined: */
// Define function marking macros if compiler specific macros are undefined:
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
#define KOKKOS_FORCEINLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_INLINE_FUNCTION )
#define KOKKOS_INLINE_FUNCTION inline
#if !defined( KOKKOS_INLINE_FUNCTION )
#define KOKKOS_INLINE_FUNCTION inline
#endif
#if ! defined( KOKKOS_FUNCTION )
#define KOKKOS_FUNCTION /**/
#endif
//----------------------------------------------------------------------------
///** Define empty macro for restrict if necessary: */
#if ! defined(KOKKOS_RESTRICT)
#define KOKKOS_RESTRICT
#if !defined( KOKKOS_FUNCTION )
#define KOKKOS_FUNCTION /**/
#endif
//----------------------------------------------------------------------------
/** Define Macro for alignment: */
#if ! defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#endif
// Define empty macro for restrict if necessary:
#if ! defined(KOKKOS_ALIGN)
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#if ! defined(KOKKOS_ALIGN_PTR)
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#if !defined( KOKKOS_RESTRICT )
#define KOKKOS_RESTRICT
#endif
//----------------------------------------------------------------------------
/** Determine the default execution space for parallel dispatch.
* There is zero or one default execution space specified.
*/
#if 1 < ( ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
// Define Macro for alignment:
#if !defined KOKKOS_ALIGN_SIZE
#define KOKKOS_ALIGN_SIZE 16
#endif
/** If default is not specified then chose from enabled execution spaces.
* Priority: CUDA, OPENMP, THREADS, SERIAL
*/
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined ( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined ( KOKKOS_ENABLE_OPENMP )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined ( KOKKOS_ENABLE_PTHREAD )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
#if !defined( KOKKOS_ALIGN )
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
#endif
#if !defined( KOKKOS_ALIGN_PTR )
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
#endif
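A one-line sketch of the alignment macros, assuming a GNU-compatible compiler (the struct is illustrative):

```cpp
// Expands (by default) to struct __attribute__((aligned(16))) Vec3 { ... };
struct KOKKOS_ALIGN( KOKKOS_ALIGN_SIZE ) Vec3 { double x, y, z; };
```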
//----------------------------------------------------------------------------
// Determine the default execution space for parallel dispatch.
// There is zero or one default execution space specified.
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
#error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified."
#endif
// If default is not specified then chose from enabled execution spaces.
// Priority: CUDA, OPENMP, THREADS, QTHREADS, SERIAL
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
#elif defined( KOKKOS_ENABLE_OPENMP )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
#elif defined( KOKKOS_ENABLE_PTHREAD )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
//#elif defined( KOKKOS_ENABLE_QTHREADS )
// #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
#else
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
#endif
//----------------------------------------------------------------------------
/** Determine for what space the code is being compiled: */
// Determine for what space the code is being compiled:
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_ENABLE_CUDA)
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
#endif
#if defined( KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN )
#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
#endif
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/**Enable Profiling by default**/
// Enable Profiling by default
#ifndef KOKKOS_ENABLE_PROFILING
#define KOKKOS_ENABLE_PROFILING 1
#define KOKKOS_ENABLE_PROFILING 1
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_MACROS_HPP */
#endif // #ifndef KOKKOS_MACROS_HPP

View File

@ -1294,6 +1294,7 @@ public:
KOKKOS_INLINE_FUNCTION
size_t get_min_block_size() const { return MIN_BLOCK_SIZE; }
KOKKOS_INLINE_FUNCTION
size_t get_mem_size() const { return m_data_size; }
private:

View File

@ -66,7 +66,6 @@
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
@ -196,6 +195,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */

View File

@ -78,16 +78,14 @@ struct pair
/// This calls the default constructors of T1 and T2. It won't
/// compile if those default constructors are not defined and
/// public.
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first(), second()
{}
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair() = default;
/// \brief Constructor that takes both elements of the pair.
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type const& f, second_type const& s)
: first(f), second(s)
{}
@ -97,7 +95,7 @@ struct pair
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -107,7 +105,7 @@ struct pair
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const volatile pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -183,7 +181,7 @@ struct pair<T1&, T2&>
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type f, second_type s)
: first(f), second(s)
{}
@ -193,7 +191,7 @@ struct pair<T1&, T2&>
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -247,7 +245,7 @@ struct pair<T1, T2&>
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type const& f, second_type s)
: first(f), second(s)
{}
@ -257,7 +255,7 @@ struct pair<T1, T2&>
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -311,7 +309,7 @@ struct pair<T1&, T2>
///
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(first_type f, second_type const& s)
: first(f), second(s)
{}
@ -321,7 +319,7 @@ struct pair<T1&, T2>
/// This calls the copy constructors of T1 and T2. It won't compile
/// if those copy constructors are not defined and public.
template <class U, class V>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,V> &p)
: first(p.first), second(p.second)
{}
@ -366,31 +364,31 @@ bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
//! Inequality operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs==rhs); }
//! Less-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
//! Less-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(rhs<lhs); }
//! Greater-than operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return rhs<lhs; }
//! Greater-than-or-equal-to operator for Kokkos::pair.
template <class T1, class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
{ return !(lhs<rhs); }
@ -399,7 +397,7 @@ bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
/// This is a "nonmember constructor" for Kokkos::pair. It works just
/// like std::make_pair.
template <class T1,class T2>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair<T1,T2> make_pair (T1 x, T2 y)
{ return ( pair<T1,T2>(x,y) ); }
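With the constexpr qualifiers added above, `Kokkos::pair` works like `std::pair` but inside device code; a small sketch:

```cpp
KOKKOS_INLINE_FUNCTION
double lookup()
{
  // make_pair deduces Kokkos::pair< int, double >; comparisons are lexicographic.
  Kokkos::pair< int, double > p = Kokkos::make_pair( 3, 2.5 );
  return p.first < 4 ? p.second : 0.0;
}
```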
@ -460,23 +458,21 @@ struct pair<T1,void>
first_type first;
enum { second = 0 };
KOKKOS_FORCEINLINE_FUNCTION
pair()
: first()
{}
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair() = default;
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(const first_type & f)
: first(f)
{}
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair(const first_type & f, int)
: first(f)
{}
template <class U>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
pair( const pair<U,void> &p)
: first(p.first)
{}
@ -495,32 +491,32 @@ struct pair<T1,void>
//
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first==rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs==rhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return lhs.first<rhs.first; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(rhs<lhs); }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return rhs<lhs; }
template <class T1>
KOKKOS_FORCEINLINE_FUNCTION
KOKKOS_FORCEINLINE_FUNCTION constexpr
bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
{ return !(lhs<rhs); }
@ -528,3 +524,4 @@ bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
#endif // KOKKOS_PAIR_HPP

View File

@ -52,13 +52,14 @@
#include <Kokkos_View.hpp>
#include <Kokkos_ExecPolicy.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <typeinfo>
#endif
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#ifdef KOKKOS_DEBUG
@ -175,7 +176,7 @@ void parallel_for( const ExecPolicy & policy
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -188,7 +189,7 @@ void parallel_for( const ExecPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
@ -207,7 +208,7 @@ void parallel_for( const size_t work_count
execution_space ;
typedef RangePolicy< execution_space > policy ;
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -220,7 +221,7 @@ void parallel_for( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelFor(kpID);
}
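The begin/end hooks above are what give kernels names in profiling tools; passing the optional trailing string replaces the `typeid` fallback. A hedged sketch (names illustrative):

```cpp
void axpby( Kokkos::View< double* > y, Kokkos::View< const double* > x,
            const double a, const double b, const int n )
{
  // The label replaces typeid(FunctorType).name() in profiler output.
  Kokkos::parallel_for( n, KOKKOS_LAMBDA( const int i ) {
    y( i ) = a * x( i ) + b * y( i );
  }, "axpby" );
}
```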
@ -417,7 +418,7 @@ void parallel_scan( const ExecutionPolicy & policy
, typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
)
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -430,7 +431,7 @@ void parallel_scan( const ExecutionPolicy & policy
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}
@ -450,7 +451,7 @@ void parallel_scan( const size_t work_count
typedef Kokkos::RangePolicy< execution_space > policy ;
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
@ -463,7 +464,7 @@ void parallel_scan( const size_t work_count
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelScan(kpID);
}

View File

@ -1094,7 +1094,7 @@ namespace Impl {
const PolicyType& policy,
const FunctorType& functor,
ReturnType& return_value) {
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
@ -1116,7 +1116,7 @@ namespace Impl {
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}

View File

@ -41,52 +41,70 @@
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_HPP
#define KOKKOS_QTHREAD_HPP
#ifndef KOKKOS_QTHREADS_HPP
#define KOKKOS_QTHREADS_HPP
#include <Kokkos_Core_fwd.hpp>
#ifdef KOKKOS_ENABLE_QTHREADS
// Defines to enable experimental Qthreads functionality.
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread.h>
#include <cstddef>
#include <iosfwd>
#include <Kokkos_Core.hpp>
#include <Kokkos_Layout.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <Kokkos_HostSpace.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp>
//#include <Kokkos_MemoryTraits.hpp>
//#include <Kokkos_ExecPolicy.hpp>
//#include <Kokkos_TaskScheduler.hpp> // Uncomment when Tasking working.
#include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
class QthreadExec ;
class QthreadsExec;
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
/** \brief Execution space supported by Qthread */
class Qthread {
/** \brief Execution space supported by Qthreads */
class Qthreads {
public:
//! \name Type declarations that all Kokkos devices must provide.
//@{
//! Tag this class as an execution space
typedef Qthread execution_space ;
typedef Kokkos::HostSpace memory_space ;
typedef Qthreads execution_space;
typedef Kokkos::HostSpace memory_space;
//! This execution space's preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type;
typedef Kokkos::Device< execution_space, memory_space > device_type;
typedef Kokkos::LayoutRight array_layout ;
typedef memory_space::size_type size_type ;
typedef Kokkos::LayoutRight array_layout;
typedef memory_space::size_type size_type;
typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
typedef ScratchMemorySpace< Qthreads > scratch_memory_space;
//@}
/*------------------------------------------------------------------------*/
/** \brief Initialization will construct one or more instances */
static Qthread & instance( int = 0 );
static Qthreads & instance( int = 0 );
/** \brief Set the execution space to a "sleep" state.
*
@ -128,26 +146,24 @@ public:
static void finalize();
/** \brief Print configuration information to the given output stream. */
static void print_configuration( std::ostream & , const bool detail = false );
static void print_configuration( std::ostream &, const bool detail = false );
int shepherd_size() const ;
int shepherd_worker_size() const ;
int shepherd_size() const;
int shepherd_worker_size() const;
};
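
For orientation, a minimal usage sketch of the renamed back-end, assuming Kokkos was configured with Qthreads enabled; scale() and its kernel body are illustrative, while RangePolicy and parallel_for are the established API:

    #include <Kokkos_Core.hpp>

    void scale( double * x , const int n )
    {
      // Run a range kernel explicitly on the Qthreads execution space.
      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Qthreads >( 0 , n )
                          , KOKKOS_LAMBDA ( const int i ) { x[i] *= 2.0 ; } );
    }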
/*--------------------------------------------------------------------------*/
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
template<>
struct MemorySpaceAccess
< Kokkos::Qthread::memory_space
, Kokkos::Qthread::scratch_memory_space
< Kokkos::Qthreads::memory_space
, Kokkos::Qthreads::scratch_memory_space
>
{
enum { assignable = false };
@ -157,27 +173,26 @@ struct MemorySpaceAccess
template<>
struct VerifyExecutionCanAccessMemorySpace
< Kokkos::Qthread::memory_space
, Kokkos::Qthread::scratch_memory_space
< Kokkos::Qthreads::memory_space
, Kokkos::Qthreads::scratch_memory_space
>
{
enum { value = true };
inline static void verify( void ) { }
inline static void verify( const void * ) { }
inline static void verify( void ) {}
inline static void verify( const void * ) {}
};
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
#include <Kokkos_Parallel.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
#include <Qthread/Kokkos_Qthread_Parallel.hpp>
#include <Qthreads/Kokkos_QthreadsExec.hpp>
#include <Qthreads/Kokkos_Qthreads_Parallel.hpp>
//#include <Qthreads/Kokkos_Qthreads_Task.hpp> // Uncomment when Tasking working.
//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
#endif /* #define KOKKOS_QTHREAD_HPP */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // #define KOKKOS_ENABLE_QTHREADS
#endif // #define KOKKOS_QTHREADS_HPP

View File

@ -56,6 +56,8 @@
#include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_FunctorAnalysis.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -138,30 +140,15 @@ public:
static void initialize( unsigned threads_count = 1 ,
unsigned use_numa_count = 0 ,
unsigned use_cores_per_numa = 0 ,
bool allow_asynchronous_threadpool = false) {
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
bool allow_asynchronous_threadpool = false);
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
static int is_initialized() { return 1 ; }
static int is_initialized();
/** \brief Return the maximum amount of concurrency. */
static int concurrency() {return 1;};
//! Free any resources being consumed by the device.
static void finalize() {
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
static void finalize();
//! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@ -177,10 +164,6 @@ public:
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
//--------------------------------------------------------------------------
static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
//--------------------------------------------------------------------------
};
} // namespace Kokkos
@ -213,22 +196,6 @@ struct VerifyExecutionCanAccessMemorySpace
inline static void verify( const void * ) { }
};
namespace SerialImpl {
struct Sentinel {
void * m_scratch ;
unsigned m_reduce_end ;
unsigned m_shared_end ;
Sentinel();
~Sentinel();
static Sentinel & singleton();
};
inline
unsigned align( unsigned n );
}
} // namespace Impl
} // namespace Kokkos
@ -238,89 +205,26 @@ unsigned align( unsigned n );
namespace Kokkos {
namespace Impl {
class SerialTeamMember {
private:
typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
const scratch_memory_space m_space ;
const int m_league_rank ;
const int m_league_size ;
// Resize thread team data scratch memory
void serial_resize_thread_team_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes );
SerialTeamMember & operator = ( const SerialTeamMember & );
HostThreadTeamData * serial_get_thread_team_data();
public:
} /* namespace Impl */
} /* namespace Kokkos */
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_space ; }
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_scratch(int) const
{ return m_space ; }
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & thread_scratch(int) const
{ return m_space ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(const ValueType& , const int& ) const {}
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION
ValueType team_reduce( const ValueType & value , const JoinOp & ) const
{
return value ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering,
* combined with a non-deterministic inter-team accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
{
const Type tmp = global_accum ? *global_accum : Type(0) ;
if ( global_accum ) { *global_accum += value ; }
return tmp ;
}
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
{ return Type(0); }
//----------------------------------------
// Execution space specific:
SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
);
};
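
To make the team_scan contract concrete, a hypothetical team of size 4 in which every rank contributes the value 3 would observe:

    //   rank:              0   1   2   3
    //   team_scan( 3 ):    0   3   6   9    exclusive prefix over lower ranks
    //   reduction total:   team_scan( 3 ) + 3 == 12 on the highest rank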
} // namespace Impl
namespace Kokkos {
namespace Impl {
/*
* < Kokkos::Serial , WorkArgTag >
* < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
*
*/
namespace Impl {
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<Properties...>
{
@ -441,14 +345,11 @@ public:
return p;
};
typedef Impl::SerialTeamMember member_type ;
typedef Impl::HostThreadTeamMember< Kokkos::Serial > member_type ;
};
} /* namespace Impl */
} /* namespace Kokkos */
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Parallel patterns for Kokkos::Serial with RangePolicy */
@ -521,11 +422,12 @@ private:
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -535,34 +437,25 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -570,10 +463,29 @@ public:
inline
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
const size_t team_reduce_size = 0 ; // Never shrinks
const size_t team_shared_size = 0 ; // Never shrinks
const size_t thread_local_size = 0 ; // Never shrinks
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
pointer_type ptr =
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
this-> template exec< WorkTag >( update );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
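
The resize/init/exec/final sequence above is the machinery behind the ordinary user-facing reduction; a minimal sketch using only the established API, with n and the kernel body illustrative:

    double sum = 0.0 ;
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Serial >( 0 , n )
                           , KOKKOS_LAMBDA ( const int i , double & update )
                             { update += 1.0 * i ; }
                           , sum );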
template< class HostViewType >
@ -587,7 +499,7 @@ public:
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
, m_result_ptr( arg_result_view.data() )
{
static_assert( Kokkos::is_view< HostViewType >::value
, "Kokkos::Serial reduce result must be a View" );
@ -623,11 +535,13 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -635,10 +549,8 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
reference_type update = ValueInit::init( m_functor , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update , true );
@ -648,11 +560,9 @@ private:
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( reference_type update ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr );
const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update , true );
@ -664,9 +574,22 @@ public:
inline
void execute() const
{
pointer_type ptr = (pointer_type)
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
this-> template exec< WorkTag >( ptr );
const size_t pool_reduce_size = Analysis::value_size( m_functor );
const size_t team_reduce_size = 0 ; // Never shrinks
const size_t team_shared_size = 0 ; // Never shrinks
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
reference_type update =
ValueInit::init( m_functor , pointer_type(data.pool_reduce_local()) );
this-> template exec< WorkTag >( update );
}
inline
@ -696,6 +619,8 @@ class ParallelFor< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef TeamPolicyInternal< Kokkos::Serial , Properties ...> Policy ;
typedef typename Policy::member_type Member ;
@ -706,21 +631,21 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec() const
exec( HostThreadTeamData & data ) const
{
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) );
m_functor( Member(data,ileague,m_league) );
}
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec() const
exec( HostThreadTeamData & data ) const
{
const TagType t{} ;
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) );
m_functor( t , Member(data,ileague,m_league) );
}
}
@ -729,15 +654,28 @@ public:
inline
void execute() const
{
Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
this-> template exec< typename Policy::work_tag >();
const size_t pool_reduce_size = 0 ; // Never shrinks
const size_t team_reduce_size = TEAM_REDUCE_SIZE ;
const size_t team_shared_size = m_shared ;
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
this->template exec< typename Policy::work_tag >( data );
}
ParallelFor( const FunctorType & arg_functor
, const Policy & arg_policy )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
, m_shared( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{ }
};
@ -752,18 +690,22 @@ class ParallelReduce< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const int m_league ;
@ -774,33 +716,23 @@ private:
template< class TagType >
inline
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( HostThreadTeamData & data , reference_type update ) const
{
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) , update );
m_functor( Member(data,ileague,m_league) , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class TagType >
inline
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const
exec( HostThreadTeamData & data , reference_type update ) const
{
const TagType t{} ;
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) , update );
m_functor( t , Member(data,ileague,m_league) , update );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
public:
@ -808,10 +740,31 @@ public:
inline
void execute() const
{
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
const size_t team_reduce_size = TEAM_REDUCE_SIZE ;
const size_t team_shared_size = m_shared ;
const size_t thread_local_size = 0 ; // Never shrinks
serial_resize_thread_team_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
HostThreadTeamData & data = *serial_get_thread_team_data();
pointer_type ptr =
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
this-> template exec< WorkTag >( data , update );
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
}
template< class ViewType >
@ -825,8 +778,10 @@ public:
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
, m_result_ptr( arg_result.data() )
, m_shared( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
{
static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@ -844,7 +799,9 @@ public:
, m_league( arg_policy.league_size() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shared( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
@ -858,261 +815,6 @@ public:
/*--------------------------------------------------------------------------*/
/*--------------------------------------------------------------------------*/
/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
namespace Kokkos {
namespace Impl {
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
const iType begin ;
const iType end ;
enum {increment = 1};
const SerialTeamMember& thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
: begin(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
: begin( arg_begin )
, end( arg_end)
, thread( arg_thread )
{}
};
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
typedef iType index_type;
enum {start = 0};
const iType end;
enum {increment = 1};
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
end( count )
{}
};
} // namespace Impl
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::SerialTeamMember >
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
}
} //namespace Kokkos
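
A sketch of how the join overload above is called from inside a team functor; member, x, n, and the max-reduction are illustrative:

    double team_max = -1.0e300 ;  // input must be the neutral element for the join
    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n )
                           , [&] ( const int i , double & val )
                             { if ( x[i] > val ) val = x[i] ; }
                           , [] ( double & dst , const double & src )
                             { if ( src > dst ) dst = src ; }
                           , team_max );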
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
} // namespace Kokkos
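
A sketch of the scan contract just described: the contribution of each i is accumulated on every invocation, while the exclusive prefix is written only when final is true; member, x, y, and n are illustrative:

    Kokkos::parallel_scan( Kokkos::ThreadVectorRange( member , n )
                         , [&] ( const int i , double & val , const bool final )
                           {
                             if ( final ) { y[i] = val ; }  // exclusive prefix, final pass only
                             val += x[i] ;                  // contribution added on every pass
                           } );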
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
lambda(val);
}
}
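
These guards let exactly one vector lane (PerThread) or one thread (PerTeam) perform a side effect; a sketch in which member and compute_once() are illustrative, and the value overload broadcasts the result on back-ends with real teams:

    double val ;
    Kokkos::single( Kokkos::PerTeam( member )
                  , [&] ( double & v ) { v = compute_once() ; }
                  , val );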
//----------------------------------------------------------------------------
#include <impl/Kokkos_Serial_Task.hpp>

View File

@ -82,6 +82,15 @@ class Future ;
template< typename Space >
class TaskScheduler ;
template< typename Space >
void wait( TaskScheduler< Space > const & );
template< typename Space >
struct is_scheduler : public std::false_type {};
template< typename Space >
struct is_scheduler< TaskScheduler< Space > > : public std::true_type {};
} // namespace Kokkos
#include <impl/Kokkos_TaskQueue.hpp>
@ -109,9 +118,6 @@ namespace Impl {
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
} // namespace Impl
} // namespace Kokkos
@ -312,6 +318,19 @@ public:
}
};
// Is a Future with the given execution space
template< typename , typename ExecSpace = void >
struct is_future : public std::false_type {};
template< typename Arg1 , typename Arg2 , typename ExecSpace >
struct is_future< Future<Arg1,Arg2> , ExecSpace >
: public std::integral_constant
< bool ,
( std::is_same< ExecSpace , void >::value ||
std::is_same< ExecSpace
, typename Future<Arg1,Arg2>::execution_space >::value )
> {};
} // namespace Kokkos
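
Together with is_scheduler above, this trait lets interfaces constrain their arguments at compile time, as the static_asserts in TaskTeam and TaskSingle below do; a sketch:

    static_assert( Kokkos::is_scheduler< Kokkos::TaskScheduler< Kokkos::Serial > >::value
                 , "a TaskScheduler is a scheduler" );
    static_assert( Kokkos::is_future< Kokkos::Future< double , Kokkos::Serial > >::value
                 , "a Future is a future" );
    static_assert( ! Kokkos::is_future< int >::value , "an int is not a future" );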
//----------------------------------------------------------------------------
@ -319,18 +338,59 @@ public:
namespace Kokkos {
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
enum TaskPriority { TaskHighPriority = 0
, TaskRegularPriority = 1
, TaskLowPriority = 2 };
template< typename Space >
void wait( TaskScheduler< Space > const & );
enum class TaskPriority : int { High = 0
, Regular = 1
, Low = 2 };
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< int TaskEnum , typename DepFutureType >
struct TaskPolicyData
{
using execution_space = typename DepFutureType::execution_space ;
using scheduler_type = TaskScheduler< execution_space > ;
enum : int { m_task_type = TaskEnum };
scheduler_type const * m_scheduler ;
DepFutureType const m_dependence ;
int m_priority ;
TaskPolicyData() = delete ;
TaskPolicyData( TaskPolicyData && ) = default ;
TaskPolicyData( TaskPolicyData const & ) = default ;
TaskPolicyData & operator = ( TaskPolicyData && ) = default ;
TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicyData( DepFutureType && arg_future
, Kokkos::TaskPriority const & arg_priority )
: m_scheduler( 0 )
, m_dependence( arg_future )
, m_priority( static_cast<int>( arg_priority ) )
{}
KOKKOS_INLINE_FUNCTION
TaskPolicyData( scheduler_type const & arg_scheduler
, Kokkos::TaskPriority const & arg_priority )
: m_scheduler( & arg_scheduler )
, m_dependence()
, m_priority( static_cast<int>( arg_priority ) )
{}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
@ -348,52 +408,13 @@ private:
queue_type * m_queue ;
//----------------------------------------
// Process optional arguments to spawn and respawn functions
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const ) {}
// TaskTeam or TaskSingle
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskType const & arg
, Options const & ... opts )
{
task->m_task_type = arg ;
assign( task , opts ... );
}
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskPriority const & arg
, Options const & ... opts )
{
task->m_priority = arg ;
assign( task , opts ... );
}
// Future for a dependence
template< typename A1 , typename A2 , typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, Future< A1 , A2 > const & arg
, Options const & ... opts )
{
task->add_dependence( arg.m_task );
assign( task , opts ... );
}
//----------------------------------------
public:
using execution_policy = TaskScheduler ;
using execution_space = ExecSpace ;
using memory_space = typename queue_type::memory_space ;
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
using member_type =
typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
KOKKOS_INLINE_FUNCTION
TaskScheduler() : m_track(), m_queue(0) {}
@ -460,18 +481,13 @@ public:
//----------------------------------------
/**\brief A task spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
KOKKOS_FUNCTION
Future< typename FunctorType::value_type , ExecSpace >
task_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
template< int TaskEnum , typename DepFutureType , typename FunctorType >
KOKKOS_FUNCTION static
Kokkos::Future< typename FunctorType::value_type , execution_space >
spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
, typename task_base::function_type arg_function
, FunctorType && arg_functor
)
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
@ -479,11 +495,21 @@ public:
, value_type
, FunctorType > ;
queue_type * const queue =
arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
arg_policy.m_dependence.m_task
? arg_policy.m_dependence.m_task->m_queue
: (queue_type*) 0 );
if ( 0 == queue ) {
Kokkos::abort("Kokkos spawn given null Future" );
}
//----------------------------------------
// Give single-thread back-ends an opportunity to clear
// the queue of ready tasks before allocating a new task.
m_queue->iff_single_thread_recursive_execute();
queue->iff_single_thread_recursive_execute();
//----------------------------------------
@ -491,176 +517,129 @@ public:
// Allocate task from memory pool
f.m_task =
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
if ( f.m_task ) {
// Placement new construction
new ( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 for the matching decrement when task is complete
// +1 for the future
new ( f.m_task )
task_type( arg_function
, queue
, arg_policy.m_dependence.m_task /* dependence */
, 2 /* reference count */
, int(sizeof(task_type)) /* allocation size */
, int(arg_policy.m_task_type)
, int(arg_policy.m_priority)
, std::move(arg_functor) );
// Reference count starts at two
// +1 for matching decrement when task is complete
// +1 for future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
// The dependence (if any) is processed immediately
// within the schedule function, as such the dependence's
// reference count does not need to be incremented for
// the assignment.
assign( f.m_task , arg_options... );
// Spawning from within the execution space so the
// apply function pointer is guaranteed to be valid
f.m_task->m_apply = task_type::apply ;
m_queue->schedule( f.m_task );
// this task may be updated or executed at any moment
queue->schedule_runnable( f.m_task );
// This task may be updated or executed at any moment,
// even during the call to 'schedule'.
}
return f ;
}
/**\brief The host process spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
inline
Future< typename FunctorType::value_type , ExecSpace >
host_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
template< typename FunctorType , typename A1 , typename A2 >
KOKKOS_FUNCTION static
void
respawn( FunctorType * arg_self
, Future<A1,A2> const & arg_dependence
, TaskPriority const & arg_priority
)
{
// Precondition: task is in Executing state
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
if ( m_queue == 0 ) {
Kokkos::abort("Kokkos::TaskScheduler not initialized");
}
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
if ( f.m_task ) {
// Placement new construction
new( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Potentially spawning outside execution space so the
// apply function pointer must be obtained from execution space.
// Required for Cuda execution space function pointer.
m_queue->template proc_set_apply< FunctorType >( & f.m_task->m_apply );
m_queue->schedule( f.m_task );
}
return f ;
task_type * const task = static_cast< task_type * >( arg_self );
task->m_priority = static_cast<int>(arg_priority);
task->add_dependence( arg_dependence.m_task );
// Postcondition: task is in Executing-Respawn state
}
//----------------------------------------
/**\brief Return a future that is complete
* when all input futures are complete.
*/
template< typename A1 , typename A2 >
KOKKOS_FUNCTION
Future< ExecSpace >
when_all( int narg , Future< A1 , A2 > const * const arg ) const
KOKKOS_FUNCTION static
Future< execution_space >
when_all( Future< A1 , A2 > const arg[] , int narg )
{
static_assert
( std::is_same< execution_space
, typename Future< A1 , A2 >::execution_space
>::value
, "Future must have same execution space" );
using future_type = Future< ExecSpace > ;
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
using future_type = Future< execution_space > ;
using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ;
future_type f ;
if ( narg ) {
queue_type * queue = 0 ;
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = arg[i].m_task ;
if ( 0 != t ) {
// Increment reference count to track subsequent assignment.
Kokkos::atomic_increment( &(t->m_ref_count) );
if ( queue == 0 ) {
queue = t->m_queue ;
}
else if ( queue != t->m_queue ) {
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
}
}
}
if ( queue != 0 ) {
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
reinterpret_cast< task_base * >( queue->allocate( size ) );
if ( f.m_task ) {
new( f.m_task ) task_base();
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
new( f.m_task ) task_base( queue
, 2 /* reference count */
, size /* allocation size */
, narg /* dependence count */
);
// Assign dependences, reference counts were already incremented
task_base ** const dep = f.m_task->aggregate_dependences();
// Assign dependences and increment their reference counts.
// The futures may be destroyed upon returning from this call,
// so the incremented count tracks each assignment.
for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = dep[i] = arg[i].m_task ;
if ( 0 != t ) {
Kokkos::atomic_increment( &(t->m_ref_count) );
}
}
m_queue->schedule( f.m_task );
queue->schedule_aggregate( f.m_task );
// this when_all may be processed at any moment
}
}
}
return f ;
}
/**\brief An executing task respawns itself with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
*/
template< class FunctorType , typename ... Options >
KOKKOS_FUNCTION
void respawn( FunctorType * task_self
, Options const & ... arg_options ) const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_type * const task = static_cast< task_type * >( task_self );
// Reschedule task with no dependences.
m_queue->reschedule( task );
// Dependences, if requested, are added here through parsing the arguments.
assign( task , arg_options... );
}
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
//----------------------------------------
inline
KOKKOS_INLINE_FUNCTION
int allocation_capacity() const noexcept
{ return m_queue->m_memory.get_mem_size(); }
@ -676,12 +655,192 @@ public:
long allocated_task_count_accum() const noexcept
{ return m_queue->m_accum_alloc ; }
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
};
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
//----------------------------------------------------------------------------
// Construct a TaskTeam execution policy
template< typename T >
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>
KOKKOS_INLINE_FUNCTION
TaskTeam( T const & arg
, TaskPriority const & arg_priority = TaskPriority::Regular
)
{
static_assert( Kokkos::is_future<T>::value ||
Kokkos::is_scheduler<T>::value
, "Kokkos TaskTeam argument must be Future or TaskScheduler" );
return
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>( arg , arg_priority );
}
// Construct a TaskSingle execution policy
template< typename T >
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>
KOKKOS_INLINE_FUNCTION
TaskSingle( T const & arg
, TaskPriority const & arg_priority = TaskPriority::Regular
)
{
static_assert( Kokkos::is_future<T>::value ||
Kokkos::is_scheduler<T>::value
, "Kokkos TaskSingle argument must be Future or TaskScheduler" );
return
Kokkos::Impl::TaskPolicyData
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
, typename std::conditional< Kokkos::is_future< T >::value , T ,
typename Kokkos::Future< typename T::execution_space > >::type
>( arg , arg_priority );
}
//----------------------------------------------------------------------------
/**\brief A host control thread spawns a task with options
*
* 1) Team or Serial
* 2) With scheduler or dependence
* 3) High, Normal, or Low priority
*/
template< int TaskEnum
, typename DepFutureType
, typename FunctorType >
Future< typename FunctorType::value_type
, typename DepFutureType::execution_space >
host_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
, FunctorType && arg_functor
)
{
using exec_space = typename DepFutureType::execution_space ;
using scheduler = TaskScheduler< exec_space > ;
typedef Impl::TaskBase< exec_space
, typename FunctorType::value_type
, FunctorType
> task_type ;
static_assert( TaskEnum == task_type::TaskTeam ||
TaskEnum == task_type::TaskSingle
, "Kokkos host_spawn requires TaskTeam or TaskSingle" );
// May be spawning a Cuda task, must use the specialization
// to query on-device function pointer.
typename task_type::function_type const ptr =
Kokkos::Impl::TaskQueueSpecialization< exec_space >::
template get_function_pointer< task_type >();
return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
}
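
Putting the new pieces together, a host-side spawn might look like the following sketch; MyTask is an illustrative functor providing value_type (here double) and operator()( member_type & , double & ), and scheduler construction is elided:

    Kokkos::TaskScheduler< Kokkos::Serial > sched /* = ... */ ;

    Kokkos::Future< double , Kokkos::Serial > f =
      Kokkos::host_spawn( Kokkos::TaskSingle( sched , Kokkos::TaskPriority::Regular )
                        , MyTask() );

    Kokkos::wait( sched );  // drain the queue; f is complete afterwards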
/**\brief A task spawns a task with options
*
* 1) Team or Serial
* 2) With scheduler or dependence
* 3) High, Normal, or Low priority
*/
template< int TaskEnum
, typename DepFutureType
, typename FunctorType >
Future< typename FunctorType::value_type
, typename DepFutureType::execution_space >
KOKKOS_INLINE_FUNCTION
task_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
, FunctorType && arg_functor
)
{
using exec_space = typename DepFutureType::execution_space ;
using scheduler = TaskScheduler< exec_space > ;
typedef Impl::TaskBase< exec_space
, typename FunctorType::value_type
, FunctorType
> task_type ;
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) && \
defined( KOKKOS_ENABLE_CUDA )
static_assert( ! std::is_same< Kokkos::Cuda , exec_space >::value
, "Error calling Kokkos::task_spawn for Cuda space within Host code" );
#endif
static_assert( TaskEnum == task_type::TaskTeam ||
TaskEnum == task_type::TaskSingle
, "Kokkos host_spawn requires TaskTeam or TaskSingle" );
typename task_type::function_type const ptr = task_type::apply ;
return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
}
/**\brief A task respawns itself with options
*
* 1) With scheduler or dependence
* 2) High, Normal, or Low priority
*/
template< typename FunctorType , typename T >
void
KOKKOS_INLINE_FUNCTION
respawn( FunctorType * arg_self
, T const & arg
, TaskPriority const & arg_priority = TaskPriority::Regular
)
{
static_assert( Kokkos::is_future<T>::value ||
Kokkos::is_scheduler<T>::value
, "Kokkos respawn argument must be Future or TaskScheduler" );
TaskScheduler< typename T::execution_space >::
respawn( arg_self , arg , arg_priority );
}
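
From inside a task's operator(), re-queueing behind a freshly spawned child then looks like this sketch; sched is assumed to be a scheduler captured in the functor, and ChildTask is illustrative:

    // Inside MyTask::operator()( member_type & member , double & result ):
    auto child = Kokkos::task_spawn( Kokkos::TaskSingle( sched ) , ChildTask() );
    Kokkos::respawn( this , child , Kokkos::TaskPriority::High );
    // This task re-enters the queue once 'child' completes.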
//----------------------------------------------------------------------------
template< typename A1 , typename A2 >
KOKKOS_INLINE_FUNCTION
Future< typename Future< A1 , A2 >::execution_space >
when_all( Future< A1 , A2 > const arg[]
, int narg
)
{
return TaskScheduler< typename Future<A1,A2>::execution_space >::
when_all( arg , narg );
}
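
Aggregation composes with spawn and respawn; a sketch in which f0 and f1 are assumed to be futures produced on the same scheduler (mixing schedulers aborts, as the member function above shows):

    Kokkos::Future< Kokkos::Serial > deps[2] = { f0 , f1 };
    Kokkos::Future< Kokkos::Serial > all = Kokkos::when_all( deps , 2 );
    // 'all' completes only after both inputs complete, and can serve as
    // the dependence of a subsequent spawn or respawn.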
//----------------------------------------------------------------------------
// Wait for all runnable tasks to complete
template< typename ExecSpace >
inline
void wait( TaskScheduler< ExecSpace > const & policy )
{ policy.m_queue->execute(); }
void wait( TaskScheduler< ExecSpace > const & scheduler )
{ scheduler.m_queue->execute(); }
} // namespace Kokkos

View File

@ -230,4 +230,3 @@ struct VerifyExecutionCanAccessMemorySpace
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
#endif /* #define KOKKOS_THREADS_HPP */

View File

@ -40,9 +40,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
CONDITIONAL_COPIES += copy-threads
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
KOKKOS_HEADERS_QTHREAD += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
CONDITIONAL_COPIES += copy-qthread
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
KOKKOS_HEADERS_QTHREADS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
CONDITIONAL_COPIES += copy-qthreads
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@ -60,6 +60,12 @@ ifeq ($(KOKKOS_OS),Darwin)
COPY_FLAG =
endif
ifeq ($(KOKKOS_DEBUG),"no")
KOKKOS_DEBUG_CMAKE = OFF
else
KOKKOS_DEBUG_CMAKE = ON
endif
messages:
echo "Start Build"
@ -91,6 +97,7 @@ build-makefile-kokkos:
echo "" >> Makefile.kokkos
echo "#Internal settings which need to propagated for Kokkos examples" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_CUDA = ${KOKKOS_INTERNAL_USE_CUDA}" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_QTHREADS = ${KOKKOS_INTERNAL_USE_QTHREADS}" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_OPENMP = ${KOKKOS_INTERNAL_USE_OPENMP}" >> Makefile.kokkos
echo "KOKKOS_INTERNAL_USE_PTHREADS = ${KOKKOS_INTERNAL_USE_PTHREADS}" >> Makefile.kokkos
echo "" >> Makefile.kokkos
@ -107,7 +114,55 @@ build-makefile-kokkos:
> Makefile.kokkos.tmp
mv -f Makefile.kokkos.tmp Makefile.kokkos
build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
build-cmake-kokkos:
rm -f kokkos.cmake
echo "#Global Settings used to generate this library" >> kokkos.cmake
echo "set(KOKKOS_PATH $(PREFIX) CACHE PATH \"Kokkos installation path\")" >> kokkos.cmake
echo "set(KOKKOS_DEVICES $(KOKKOS_DEVICES) CACHE STRING \"Kokkos devices list\")" >> kokkos.cmake
echo "set(KOKKOS_ARCH $(KOKKOS_ARCH) CACHE STRING \"Kokkos architecture flags\")" >> kokkos.cmake
echo "set(KOKKOS_DEBUG $(KOKKOS_DEBUG_CMAKE) CACHE BOOL \"Kokkos debug enabled ?)\")" >> kokkos.cmake
echo "set(KOKKOS_USE_TPLS $(KOKKOS_USE_TPLS) CACHE STRING \"Kokkos templates list\")" >> kokkos.cmake
echo "set(KOKKOS_CXX_STANDARD $(KOKKOS_CXX_STANDARD) CACHE STRING \"Kokkos C++ standard\")" >> kokkos.cmake
echo "set(KOKKOS_OPTIONS $(KOKKOS_OPTIONS) CACHE STRING \"Kokkos options\")" >> kokkos.cmake
echo "set(KOKKOS_CUDA_OPTIONS $(KOKKOS_CUDA_OPTIONS) CACHE STRING \"Kokkos Cuda options\")" >> kokkos.cmake
echo "if(NOT $ENV{CXX})" >> kokkos.cmake
echo ' message(WARNING "You are currently using compiler $${CMAKE_CXX_COMPILER} while Kokkos was built with $(CXX); make sure this is the behavior you intend.")' >> kokkos.cmake
echo "endif()" >> kokkos.cmake
echo "if(NOT DEFINED ENV{NVCC_WRAPPER})" >> kokkos.cmake
echo " set(NVCC_WRAPPER \"$(NVCC_WRAPPER)\" CACHE FILEPATH \"Path to command nvcc_wrapper\")" >> kokkos.cmake
echo "else()" >> kokkos.cmake
echo ' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
echo "endif()" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Variables used in application Makefiles" >> kokkos.cmake
echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_CPPFLAGS \"$(KOKKOS_CPPFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_LINK_DEPENDS \"$(KOKKOS_LINK_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_LIBS \"$(KOKKOS_LIBS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_LDFLAGS \"$(KOKKOS_LDFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
echo "" >> kokkos.cmake
echo "#Internal settings which need to propagated for Kokkos examples" >> kokkos.cmake
echo "set(KOKKOS_INTERNAL_USE_CUDA \"${KOKKOS_INTERNAL_USE_CUDA}\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_INTERNAL_USE_OPENMP \"${KOKKOS_INTERNAL_USE_OPENMP}\" CACHE STRING \"\")" >> kokkos.cmake
echo "set(KOKKOS_INTERNAL_USE_PTHREADS \"${KOKKOS_INTERNAL_USE_PTHREADS}\" CACHE STRING \"\")" >> kokkos.cmake
echo "mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS)" >> kokkos.cmake
echo "" >> kokkos.cmake
sed \
-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' kokkos.cmake \
> kokkos.cmake.tmp
mv -f kokkos.cmake.tmp kokkos.cmake
build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
mkdir:
mkdir -p $(PREFIX)
@ -124,9 +179,9 @@ copy-threads: mkdir
mkdir -p $(PREFIX)/include/Threads
cp $(COPY_FLAG) $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
copy-qthread: mkdir
mkdir -p $(PREFIX)/include/Qthread
cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREAD) $(PREFIX)/include/Qthread
copy-qthreads: mkdir
mkdir -p $(PREFIX)/include/Qthreads
cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREADS) $(PREFIX)/include/Qthreads
copy-openmp: mkdir
mkdir -p $(PREFIX)/include/OpenMP
@ -137,6 +192,7 @@ install: mkdir $(CONDITIONAL_COPIES) build-lib
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
cp $(COPY_FLAG) Makefile.kokkos $(PREFIX)
cp $(COPY_FLAG) kokkos.cmake $(PREFIX)
cp $(COPY_FLAG) libkokkos.a $(PREFIX)/lib
cp $(COPY_FLAG) KokkosCore_config.h $(PREFIX)/include

View File

@ -46,7 +46,6 @@
#include <omp.h>
#include <iostream>
#include <Kokkos_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
@ -107,58 +106,41 @@ private:
public:
inline void execute() const {
this->template execute_schedule<typename Policy::schedule_type::type>();
}
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
execute_schedule() const
inline void execute() const
{
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
}
/* END #pragma omp parallel */
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
execute_schedule() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
std::pair<int64_t,int64_t> range(0,0);
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
do {
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
exec.reset_steal_target();
#pragma omp barrier
ParallelFor::template
exec_range< WorkTag >( m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin() );
long work_index = exec.get_work_index();
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
ParallelFor::template exec_range< WorkTag >( m_functor , begin, end );
work_index = exec.get_work_index();
} while ( is_dynamic && 0 <= range.first );
}
}
/* END #pragma omp parallel */
// END #pragma omp parallel
}
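For orientation: the static and dynamic paths that were previously separate execute_schedule() overloads now share the single execute() above, keyed off the is_dynamic enum. On the caller side the schedule is selected through the policy's Schedule trait; a minimal sketch using the public API (loop bodies are placeholders, not from this commit):
#include <Kokkos_Core.hpp>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    // Static schedule (the default): each thread works one contiguous
    // partition, the get_work_partition() path above.
    Kokkos::parallel_for(
      Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
      KOKKOS_LAMBDA( const int i ) { (void) i ; } );
    // Dynamic schedule: threads claim chunks and steal when exhausted,
    // the get_work_stealing_chunk() path above.
    Kokkos::parallel_for(
      Kokkos::RangePolicy< Kokkos::OpenMP , Kokkos::Schedule< Kokkos::Dynamic > >( 0 , 1000 ),
      KOKKOS_LAMBDA( const int i ) { (void) i ; } );
  }
  Kokkos::finalize();
}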
inline
@ -193,17 +175,18 @@ private:
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
// Static assert: WorkTag is void if ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -247,92 +230,70 @@ private:
public:
inline void execute() const {
this->template execute_schedule<typename Policy::schedule_type::type>();
}
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
execute_schedule() const
inline void execute() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
, Kokkos::Dynamic >::value };
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t pool_reduce_bytes =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
OpenMPexec::resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
ParallelReduce::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
}
/* END #pragma omp parallel */
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
// Reduction:
data.set_work_partition( m_policy.end() - m_policy.begin()
, m_policy.chunk_size() );
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
if ( is_dynamic ) {
// Make sure work partition is set before stealing
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
std::pair<int64_t,int64_t> range(0,0);
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
do {
template<class Schedule>
inline
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
execute_schedule() const
{
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
exec.reset_steal_target();
#pragma omp barrier
long work_index = exec.get_work_index();
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
ParallelReduce::template exec_range< WorkTag >
( m_functor , begin,end
ParallelReduce::template
exec_range< WorkTag >( m_functor
, range.first + m_policy.begin()
, range.second + m_policy.begin()
, update );
work_index = exec.get_work_index();
} while ( is_dynamic && 0 <= range.first );
}
}
/* END #pragma omp parallel */
// END #pragma omp parallel
// Reduction:
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
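The join-then-final sequence above is what backs the user-facing reduction; a minimal caller-side sketch (standard Kokkos API of this era; the harmonic-sum body is illustrative):
#include <Kokkos_Core.hpp>
#include <cstdio>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    double sum = 0 ;
    // Each thread accumulates into its pool_reduce_local() slot; the
    // partial results are then joined into rank 0 and final() is applied.
    Kokkos::parallel_reduce(
      Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 1000 ),
      KOKKOS_LAMBDA( const int i , double & update ) { update += 1.0 / ( i + 1 ) ; },
      sum );
    printf( "sum = %f\n" , sum );
  }
  Kokkos::finalize();
}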
@ -394,17 +355,18 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -452,53 +414,63 @@ public:
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
const int value_count = Analysis::value_count( m_functor );
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
OpenMPexec::resize_thread_data( pool_reduce_bytes
, 0 // team_reduce_bytes
, 0 // team_shared_bytes
, 0 // thread_local_bytes
);
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr =
pointer_type( exec.scratch_reduce() ) +
ValueTraits::value_count( m_functor );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueInit::init( m_functor , ptr ) , false );
}
/* END #pragma omp parallel */
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
{
const unsigned thread_count = OpenMPexec::pool_size();
const unsigned value_count = ValueTraits::value_count( m_functor );
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
reference_type update_sum =
ValueInit::init( m_functor , data.pool_reduce_local() );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end() , update_sum , false );
if ( data.pool_rendezvous() ) {
pointer_type ptr_prev = 0 ;
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
const int n = data.pool_size();
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
for ( int i = 0 ; i < n ; ++i ) {
if ( ptr_prev ) {
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
ValueJoin::join( m_functor , ptr + value_count , ptr );
pointer_type ptr = (pointer_type)
data.pool_member(i)->pool_reduce_local();
if ( i ) {
for ( int j = 0 ; j < value_count ; ++j ) {
ptr[j+value_count] = ptr_prev[j+value_count] ;
}
ValueJoin::join( m_functor , ptr + value_count , ptr_prev );
}
else {
ValueInit::init( m_functor , ptr );
ValueInit::init( m_functor , ptr + value_count );
}
ptr_prev = ptr ;
}
data.pool_rendezvous_release();
}
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
reference_type update_base =
ValueOps::reference
( ((pointer_type)data.pool_reduce_local()) + value_count );
ParallelScan::template exec_range< WorkTag >
( m_functor , range.begin() , range.end()
, ValueOps::reference( ptr ) , true );
( m_functor , range.begin() , range.end() , update_base , true );
}
/* END #pragma omp parallel */
}
//----------------------------------------
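The scan above makes two passes per thread, with an inter-thread exclusive scan at the pool rendezvous in between; the user-facing contract is the usual final-pass flag. A small sketch (illustrative body, not from this commit):
#include <Kokkos_Core.hpp>
#include <cstdio>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    // First pass (final_pass == false) accumulates per-thread totals;
    // second pass (final_pass == true) sees 'update' pre-seeded with the
    // exclusive prefix over all preceding iterations.
    Kokkos::parallel_scan(
      Kokkos::RangePolicy< Kokkos::OpenMP >( 0 , 10 ),
      KOKKOS_LAMBDA( const int i , int & update , const bool final_pass ) {
        if ( final_pass ) { printf( "exclusive prefix at %d = %d\n" , i , update ) ; }
        update += i ;
      } );
  }
  Kokkos::finalize();
}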
@ -530,55 +502,59 @@ class ParallelFor< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::schedule_type::type SchedTag ;
typedef typename Policy::member_type Member ;
const FunctorType m_functor ;
const Policy m_policy ;
const int m_shmem_size ;
template< class TagType, class Schedule >
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Static>::value>::type
exec_team( const FunctorType & functor , Member member )
typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
for ( ; member.valid_static() ; member.next_static() ) {
functor( member );
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( Member( data, r , league_size ) );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
template< class TagType, class Schedule >
inline static
typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Static>::value >::type
exec_team( const FunctorType & functor , Member member )
{
const TagType t{} ;
for ( ; member.valid_static() ; member.next_static() ) {
functor( t , member );
}
}
template< class TagType, class Schedule >
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Dynamic>::value>::type
exec_team( const FunctorType & functor , Member member )
typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
#pragma omp barrier
for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
functor( member );
}
}
const TagType t{};
template< class TagType, class Schedule >
inline static
typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
exec_team( const FunctorType & functor , Member member )
{
#pragma omp barrier
const TagType t{} ;
for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
functor( t , member );
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( t , Member( data, r , league_size ) );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
@ -587,31 +563,75 @@ public:
inline
void execute() const
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t pool_reduce_size = 0 ; // Never shrinks
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
OpenMPexec::resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
{
ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
( m_functor
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
const int active = data.organize_team( m_policy.team_size() );
if ( active ) {
data.set_work_partition( m_policy.league_size()
, ( 0 < m_policy.chunk_size()
? m_policy.chunk_size()
: m_policy.team_iter() ) );
}
/* END #pragma omp parallel */
if ( is_dynamic ) {
// Must synchronize to make sure each team has set its
// partition before beginning the work stealing loop.
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
if ( active ) {
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelFor::template exec_team< WorkTag >
( m_functor , data
, range.first , range.second , m_policy.league_size() );
} while ( is_dynamic && 0 <= range.first );
}
data.disband_team();
}
// END #pragma omp parallel
}
inline
ParallelFor( const FunctorType & arg_functor ,
const Policy & arg_policy )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >
::value( arg_functor , arg_policy.team_size() ) )
{}
};
//----------------------------------------------------------------------------
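Caller-side shape of the team path (standard TeamPolicy API; league size 8 is illustrative, and only team rank 0 contributes to the reduction so the result is independent of team size):
#include <Kokkos_Core.hpp>
#include <cstdio>
int main( int argc , char * argv[] ) {
  Kokkos::initialize( argc , argv );
  {
    typedef Kokkos::TeamPolicy< Kokkos::OpenMP > policy_type ;
    typedef policy_type::member_type member_type ;
    // Each league rank visited here corresponds to one
    // Member( data , r , league_size ) constructed in exec_team above.
    Kokkos::parallel_for( policy_type( 8 , Kokkos::AUTO ),
      KOKKOS_LAMBDA( const member_type & member ) {
        (void) member.league_rank();
      } );
    long sum = 0 ;
    Kokkos::parallel_reduce( policy_type( 8 , Kokkos::AUTO ),
      KOKKOS_LAMBDA( const member_type & member , long & update ) {
        if ( 0 == member.team_rank() ) update += member.league_rank();
      } ,
      sum );
    printf( "sum of league ranks = %ld\n" , sum ); // 0+1+...+7 = 28
  }
  Kokkos::finalize();
}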
template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType
@ -622,20 +642,26 @@ class ParallelReduce< FunctorType
{
private:
enum { TEAM_REDUCE_SIZE = 512 };
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::schedule_type::type SchedTag ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value
, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ;
typedef typename Analysis::pointer_type pointer_type ;
typedef typename Analysis::reference_type reference_type ;
const FunctorType m_functor ;
const Policy m_policy ;
@ -645,22 +671,48 @@ private:
template< class TagType >
inline static
typename std::enable_if< std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, reference_type & update
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
for ( ; member.valid_static() ; member.next_static() ) {
functor( member , update );
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( Member( data, r , league_size ) , update );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
template< class TagType >
inline static
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
exec_team( const FunctorType & functor , Member member , reference_type update )
typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
exec_team( const FunctorType & functor
, HostThreadTeamData & data
, reference_type & update
, const int league_rank_begin
, const int league_rank_end
, const int league_size )
{
const TagType t{} ;
for ( ; member.valid_static() ; member.next_static() ) {
functor( t , member , update );
const TagType t{};
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
functor( t , Member( data, r , league_size ) , update );
if ( ++r < league_rank_end ) {
// Don't allow team members to lap one another
// so that they don't overwrite shared memory.
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
}
}
}
@ -669,43 +721,88 @@ public:
inline
void execute() const
{
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
const size_t pool_reduce_size =
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
const size_t thread_local_size = 0 ; // Never shrinks
OpenMPexec::resize_thread_data( pool_reduce_size
, team_reduce_size
, team_shared_size
, thread_local_size );
#pragma omp parallel
{
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
const int active = data.organize_team( m_policy.team_size() );
if ( active ) {
data.set_work_partition( m_policy.league_size()
, ( 0 < m_policy.chunk_size()
? m_policy.chunk_size()
: m_policy.team_iter() ) );
}
if ( is_dynamic ) {
// Must synchronize to make sure each team has set its
// partition before beginning the work stealing loop.
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
}
if ( active ) {
reference_type update =
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
std::pair<int64_t,int64_t> range(0,0);
do {
range = is_dynamic ? data.get_work_stealing_chunk()
: data.get_work_partition();
ParallelReduce::template exec_team< WorkTag >
( m_functor
, Member( exec , m_policy , m_shmem_size, 0 )
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
( m_functor , data , update
, range.first , range.second , m_policy.league_size() );
} while ( is_dynamic && 0 <= range.first );
} else {
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
, data.pool_reduce_local() );
}
/* END #pragma omp parallel */
{
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
data.disband_team();
}
// END #pragma omp parallel
int max_active_threads = OpenMPexec::pool_size();
if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
max_active_threads = m_policy.league_size()* m_policy.team_size();
// Reduction:
for ( int i = 1 ; i < max_active_threads ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
, ptr
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
}
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
if ( m_result_ptr ) {
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
}
}
}
//----------------------------------------
template< class ViewType >
inline
@ -720,7 +817,10 @@ public:
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >
::value( arg_functor , arg_policy.team_size() ) )
{}
inline
@ -731,7 +831,10 @@ public:
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
, m_shmem_size( arg_policy.scratch_size(0) +
arg_policy.scratch_size(1) +
FunctorTeamShmemSize< FunctorType >
::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value

View File

@ -46,6 +46,7 @@
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
#include <impl/Kokkos_TaskQueue_impl.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
@ -55,105 +56,46 @@ namespace Impl {
template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
class HostThreadTeamDataSingleton : private HostThreadTeamData {
private:
TaskExec< Kokkos::OpenMP >::
TaskExec()
: m_self_exec( 0 )
, m_team_exec( 0 )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( 0 )
, m_team_rank( 0 )
, m_team_size( 1 )
{
}
HostThreadTeamDataSingleton() : HostThreadTeamData()
{
Kokkos::OpenMP::memory_space space ;
const size_t num_pool_reduce_bytes = 32 ;
const size_t num_team_reduce_bytes = 32 ;
const size_t num_team_shared_bytes = 1024 ;
const size_t num_thread_local_bytes = 1024 ;
const size_t alloc_bytes =
HostThreadTeamData::scratch_size( num_pool_reduce_bytes
, num_team_reduce_bytes
, num_team_shared_bytes
, num_thread_local_bytes );
TaskExec< Kokkos::OpenMP >::
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
: m_self_exec( & arg_exec )
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
, m_team_size( arg_team_size )
{
// This team spans
// m_self_exec->pool_rev( team_size * group_rank )
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
HostThreadTeamData::scratch_assign
( space.allocate( alloc_bytes )
, alloc_bytes
, num_pool_reduce_bytes
, num_team_reduce_bytes
, num_team_shared_bytes
, num_thread_local_bytes );
}
Kokkos::memory_fence();
}
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
{
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
~HostThreadTeamDataSingleton()
{
Kokkos::OpenMP::memory_space space ;
space.deallocate( HostThreadTeamData::scratch_buffer()
, HostThreadTeamData::scratch_bytes() );
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
public:
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
static HostThreadTeamData & singleton()
{
static HostThreadTeamDataSingleton s ;
return s ;
}
}
#endif
};
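HostThreadTeamDataSingleton above relies on a function-local static (a Meyers singleton): the scratch block is allocated on first use and deallocated at program exit, with C++11 guaranteeing thread-safe initialization. The bare pattern, as a minimal generic sketch (names are illustrative, not from this commit):
#include <cstdio>
class ScratchSingleton {
private:
  ScratchSingleton()  { printf( "allocate once, on first use\n" ) ; }
  ~ScratchSingleton() { printf( "deallocate at program exit\n" ) ; }
public:
  static ScratchSingleton & singleton() {
    static ScratchSingleton s ; // constructed on first call, thread-safe in C++11
    return s ;
  }
};
int main() {
  ScratchSingleton::singleton(); // first call constructs
  ScratchSingleton::singleton(); // later calls return the same object
}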
//----------------------------------------------------------------------------
@ -163,123 +105,165 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
static task_root_type * const end =
(task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8
HostThreadTeamData & team_data_single =
HostThreadTeamDataSingleton::singleton();
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
// const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
fflush(stdout);
#endif
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
Member single_exec ;
Member team_exec( self , team_size );
// Organizing threads into a team performs a barrier across the
// entire pool to insure proper initialization of the team
// rendezvous mechanism before a team rendezvous can be performed.
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
if ( self.organize_team( team_size ) ) {
// Barrier across entire OpenMP thread pool to ensure initialization
#pragma omp barrier
Member single_exec( team_data_single );
Member team_exec( self );
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
// Loop until all queues are empty and no tasks in flight
do {
task_root_type * task = 0 ;
do {
// Each team lead attempts to acquire either a thread team task
// or a single thread task for the team.
if ( 0 == team_exec.team_rank() ) {
bool leader_loop = false ;
do {
if ( 0 != task && end != task ) {
// team member #0 completes the previously executed task,
// completion may delete the task
queue->complete( task );
}
// If 0 == m_ready_count then set task = 0
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Attempt to acquire a task
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}
// Team lead broadcasts the acquired task to team members:
// If tasks are still executing
// and no task could be acquired,
// then continue this leader loop.
leader_loop = end == task ;
if ( 1 < team_exec.team_size() ) {
if ( ( ! leader_loop ) &&
( 0 != task ) &&
( task_root_type::TaskSingle == task->m_task_type ) ) {
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
// Fence to be sure task_shared is stored before the barrier
Kokkos::memory_fence();
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
// Fence to be sure task_shared is stored
Kokkos::memory_fence();
task = *task_shared ;
}
// if a single thread task then execute now
#if 0
fprintf( stdout
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
, team_exec.m_group_rank
, team_exec.m_team_rank
, uintptr_t(task_shared)
, uintptr_t(task)
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, int64_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
(*task->m_apply)( task , & single_exec );
if ( end == task ) {
// All team members wait for whole team to reach this statement.
// It is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
leader_loop = true ;
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
} while ( leader_loop );
}
// Team lead either found 0 == m_ready_count or a team task
// Team lead broadcasts the acquired task:
team_exec.team_broadcast( task , 0);
if ( 0 != task ) { // Thread Team Task
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
, int64_t(task)
);
fflush(stdout);
#endif
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
} while( 0 != task );
if ( 0 == team_exec.team_rank() ) {
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
, self.pool_rank()
, self.pool_size()
, team_exec.team_rank()
, team_exec.team_size()
, team_exec.league_rank()
, team_exec.league_size()
);
fflush(stdout);
#endif
(*task->m_apply)( task , & single_exec );
queue->complete( task );
}
// All team members wait for whole team to reach this statement.
// Not necessary to complete the task.
// It is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
} while(1);
self.disband_team();
#if 0
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
, self.pool_rank()
, self.pool_size()
);
fflush(stdout);
#endif
}
// END #pragma omp parallel
#if 0
fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
fflush(stdout);
#endif
}
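Stripped of the Kokkos machinery, the team-lead loop above has this shape (a schematic plain-C++ sketch; Task, Queue, and the counts are stand-ins, not the real types):
#include <cstdio>
struct Task { int id ; };
static Task End_tag ;                  // address used as the "end" sentinel:
static Task * const End = & End_tag ;  // tasks in flight, none acquired yet
struct Queue {
  int  ready_count ;                   // stand-in for queue->m_ready_count
  Task storage[3] ;
  int  next ;
  Task * try_pop()                     // stand-in for pop_ready_task()
  { return next < 3 ? & storage[ next++ ] : nullptr ; }
};
// Team-lead acquire loop: spin while tasks are in flight but none could
// be acquired; return nullptr once ready_count has reached zero.
Task * acquire( Queue & q ) {
  Task * task = End ;
  while ( End == task ) {
    task = ( 0 < q.ready_count ) ? End : nullptr ;
    if ( End == task ) {
      if ( Task * t = q.try_pop() ) task = t ;
    }
  }
  return task ;
}
int main() {
  Queue q = { 3 , { {0} , {1} , {2} } , 0 };
  for ( Task * t = acquire( q ) ; nullptr != t ; t = acquire( q ) ) {
    printf( "execute and complete task %d\n" , t->id );
    --q.ready_count ;                  // stand-in for queue->complete( task )
  }
}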
void TaskQueueSpecialization< Kokkos::OpenMP >::
@ -289,13 +273,16 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
HostThreadTeamData & team_data_single =
HostThreadTeamDataSingleton::singleton();
Member single_exec( team_data_single );
task_root_type * task = end ;
@ -306,7 +293,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -60,6 +60,7 @@ public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
@ -70,296 +71,19 @@ public:
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::OpenMP
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
typename TaskType::function_type
get_function_pointer() { return TaskType::apply ; }
};
extern template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::OpenMP >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec , int arg_team_size );
void team_barrier_impl() const ;
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
* before any team member returns from
* this function call.
*/
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
template<typename iType1, typename iType2>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl:: TaskExec< Kokkos::OpenMP > & thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type<iType1, iType2>::type iType;
return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::OpenMP > >(thread, begin, end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
shared[0] += shared[i];
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
join(shared[0], shared[i]);
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
int team_size = loop_boundaries.thread.team_size();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
// Intra-member scan
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
shared[team_rank] = accum;
loop_boundaries.thread.team_barrier();
// Member 0 does a scan of the accumulated totals
if (team_rank == 0) {
for( iType i = 1; i < team_size; i+=1) {
shared[i] += shared[i-1];
}
accum = 0; // Member 0 sets accum to 0 in preparation for the inter-member scan
}
loop_boundaries.thread.team_barrier();
// Inter-member scan adding in accumulated totals
if (team_rank != 0) { accum = shared[team_rank-1]; }
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
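To make the three phases concrete, an illustrative trace (a team of 2 threads scanning {1,2,3,4}, split as {1,2} for rank 0 and {3,4} for rank 1):
Phase 1 (intra-member): rank 0 accumulates 1+2 = 3, rank 1 accumulates 3+4 = 7 ; shared = {3,7}.
Phase 2 (member 0 only): inclusive scan of shared -> {3,10} ; rank 0 resets its accum to 0.
Phase 3 (inter-member): rank 1 starts from shared[0] = 3 ; the final exclusive prefixes are {0,1} for rank 0 and {3,6} for rank 1, i.e. the exclusive scan {0,1,3,6}.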
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */

View File

@ -86,7 +86,7 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
void OpenMPexec::verify_is_process( const char * const label )
{
@ -113,67 +113,110 @@ void OpenMPexec::verify_initialized( const char * const label )
}
void OpenMPexec::clear_scratch()
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void OpenMPexec::clear_thread_data()
{
const size_t member_bytes =
sizeof(int64_t) *
HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
const int old_alloc_bytes =
m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
Kokkos::HostSpace space ;
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_pool[ rank_rev ] ) {
Record * const r = Record::get_record( m_pool[ rank_rev ] );
m_pool[ rank_rev ] = 0 ;
Record::decrement( r );
const int rank = m_map_rank[ omp_get_thread_num() ];
if ( 0 != m_pool[rank] ) {
m_pool[rank]->disband_pool();
space.deallocate( m_pool[rank] , old_alloc_bytes );
m_pool[rank] = 0 ;
}
}
/* END #pragma omp parallel */
}
void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes )
{
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
const size_t member_bytes =
sizeof(int64_t) *
HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
HostThreadTeamData * root = m_pool[0] ;
reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
const size_t old_pool_reduce = root ? root->pool_reduce_bytes() : 0 ;
const size_t old_team_reduce = root ? root->team_reduce_bytes() : 0 ;
const size_t old_team_shared = root ? root->team_shared_bytes() : 0 ;
const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
const size_t old_alloc_bytes = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;
// Requesting allocation and old allocation is too small:
// Allocate if any part of the old allocation is too small:
const bool allocate = ( old_reduce_size < reduce_size ) ||
( old_thread_size < thread_size );
if ( allocate ) {
if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
}
const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
const int pool_size = m_pool_topo[0] ;
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
( old_team_reduce < team_reduce_bytes ) ||
( old_team_shared < team_shared_bytes ) ||
( old_thread_local < thread_local_bytes );
if ( allocate ) {
clear_scratch();
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
const size_t alloc_bytes =
member_bytes +
HostThreadTeamData::scratch_size( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
const int pool_size = omp_get_max_threads();
Kokkos::HostSpace space ;
#pragma omp parallel
{
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
const int rank = pool_size - ( rank_rev + 1 );
const int rank = m_map_rank[ omp_get_thread_num() ];
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( 0 != m_pool[rank] ) {
Record * const r = Record::allocate( Kokkos::HostSpace()
, "openmp_scratch"
, alloc_size );
m_pool[rank]->disband_pool();
Record::increment( r );
space.deallocate( m_pool[rank] , old_alloc_bytes );
}
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
void * const ptr = space.allocate( alloc_bytes );
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
m_pool[ rank ] = new( ptr ) HostThreadTeamData();
m_pool[ rank ]->
scratch_assign( ((char *)ptr) + member_bytes
, alloc_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
}
/* END #pragma omp parallel */
HostThreadTeamData::organize_pool( m_pool , pool_size );
}
}
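Note the grow-only semantics above: each requested size is clamped up to its previous value before reallocating, so scratch never shrinks across kernels. An illustrative sequence (made-up sizes):
call 1: pool_reduce_bytes = 256 , team_shared_bytes = 1024 -> allocate { 256 , 1024 }
call 2: pool_reduce_bytes = 64 , team_shared_bytes = 2048 -> pool_reduce clamps up to 256 ;
        reallocation happens only because team_shared grew to 2048 , giving { 256 , 2048 }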
@ -197,14 +240,14 @@ void OpenMP::initialize( unsigned thread_count ,
// Before any other call to OMP, query the maximum number of threads
// and save the value for re-initialization unit testing.
//Using omp_get_max_threads() is problematic in conjunction with
//hwloc on Intel: an initial call to the OpenMP runtime without a
//preceding parallel region will set a process mask for a single core.
//On entering the first parallel region the runtime will then bind
//threads to other cores and make the process mask the aggregate of
//the thread masks. The intent seems to be to make serial code run
//fast if you compile with OpenMP enabled but don't actually use
//parallel regions.
//static int omp_max_threads = omp_get_max_threads();
// Using omp_get_max_threads() is problematic in conjunction with
// hwloc on Intel: an initial call to the OpenMP runtime without a
// preceding parallel region will set a process mask for a single core.
// On entering the first parallel region the runtime will then bind
// threads to other cores and make the process mask the aggregate of
// the thread masks. The intent seems to be to make serial code run
// fast if you compile with OpenMP enabled but don't actually use
// parallel regions.
// static int omp_max_threads = omp_get_max_threads();
int nthreads = 0;
#pragma omp parallel
{
@ -268,8 +311,6 @@ void OpenMP::initialize( unsigned thread_count ,
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
// Call to 'new' may not be thread safe either.
// Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
const unsigned omp_rank = omp_get_thread_num();
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
@ -286,7 +327,19 @@ void OpenMP::initialize( unsigned thread_count ,
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
// New, unified host thread team data:
{
size_t pool_reduce_bytes = 32 * thread_count ;
size_t team_reduce_bytes = 32 * thread_count ;
size_t team_shared_bytes = 1024 * thread_count ;
size_t thread_local_bytes = 1024 ;
Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes
);
}
}
}
@ -309,7 +362,7 @@ void OpenMP::initialize( unsigned thread_count ,
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -321,7 +374,8 @@ void OpenMP::finalize()
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
Impl::OpenMPexec::clear_scratch();
// New, unified host thread team data:
Impl::OpenMPexec::clear_thread_data();
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
@ -333,7 +387,7 @@ void OpenMP::finalize()
hwloc::unbind_this_thread();
}
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}

View File

@ -44,13 +44,22 @@
#ifndef KOKKOS_OPENMPEXEC_HPP
#define KOKKOS_OPENMPEXEC_HPP
#include <Kokkos_OpenMP.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <Kokkos_Atomic.hpp>
#include <iostream>
#include <sstream>
#include <fstream>
#include <omp.h>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
@ -60,41 +69,19 @@ namespace Impl {
class OpenMPexec {
public:
friend class Kokkos::OpenMP ;
enum { MAX_THREAD_COUNT = 4096 };
private:
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
static int m_pool_topo[ 4 ];
static int m_map_rank[ MAX_THREAD_COUNT ];
friend class Kokkos::OpenMP ;
static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
int const m_pool_rank ;
int const m_pool_rank_rev ;
int const m_scratch_exec_end ;
int const m_scratch_reduce_end ;
int const m_scratch_thread_end ;
int volatile m_barrier_state ;
// Members for dynamic scheduling
// Which thread am I stealing from currently
int m_current_steal_target;
// This thread's owned work_range
Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
// Team Offset if one thread determines work_range for others
long m_team_work_index;
// Is this thread stealing (i.e. its owned work_range is exhausted)?
bool m_stealing;
OpenMPexec();
OpenMPexec( const OpenMPexec & );
OpenMPexec & operator = ( const OpenMPexec & );
static void clear_scratch();
static
void clear_thread_data();
public:
@ -108,44 +95,6 @@ public:
inline static
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
inline static
OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
inline int pool_rank() const { return m_pool_rank ; }
inline int pool_rank_rev() const { return m_pool_rank_rev ; }
inline long team_work_index() const { return m_team_work_index ; }
inline int scratch_reduce_size() const
{ return m_scratch_reduce_end - m_scratch_exec_end ; }
inline int scratch_thread_size() const
{ return m_scratch_thread_end - m_scratch_reduce_end ; }
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
inline
void state_wait( int state )
{ Impl::spinwait( m_barrier_state , state ); }
inline
void state_set( int state ) { m_barrier_state = state ; }
~OpenMPexec() {}
OpenMPexec( const int arg_poolRank
, const int arg_scratch_exec_size
, const int arg_scratch_reduce_size
, const int arg_scratch_thread_size )
: m_pool_rank( arg_poolRank )
, m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
, m_scratch_exec_end( arg_scratch_exec_size )
, m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
, m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
, m_barrier_state(0)
{}
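For reference, the old per-thread allocation being removed here was a single contiguous block with running byte offsets, laid out as [ OpenMPexec | reduce | thread ]. A sketch with made-up sizes:
// Illustrative offsets only; real sizes are MEMORY_ALIGNMENT-rounded.
// arg_scratch_exec_size = 64 , reduce = 256 , thread = 1024 :
//   m_scratch_exec_end   =   64   // end of the OpenMPexec object
//   m_scratch_reduce_end =  320   // 64 + 256
//   m_scratch_thread_end = 1344   // 320 + 1024
//   scratch_reduce() == (char*) this +  64
//   scratch_thread() == (char*) this + 320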
static void finalize();
static void initialize( const unsigned team_count ,
@ -156,133 +105,20 @@ public:
static void verify_is_process( const char * const );
static void verify_initialized( const char * const );
static void resize_scratch( size_t reduce_size , size_t thread_size );
static
void resize_thread_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes );
inline static
OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
/* Dynamic Scheduling related functionality */
// Initialize the work range for this thread
inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
m_work_range.first = (begin+chunk_size-1)/chunk_size;
m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first;
}
// Claim an index from this thread's range from the beginning
inline long get_work_index_begin () {
Kokkos::pair<long,long> work_range_new = m_work_range;
Kokkos::pair<long,long> work_range_old = work_range_new;
if(work_range_old.first>=work_range_old.second)
return -1;
work_range_new.first+=1;
bool success = false;
while(!success) {
work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
success = ( (work_range_new == work_range_old) ||
(work_range_new.first>=work_range_new.second));
work_range_old = work_range_new;
work_range_new.first+=1;
}
if(work_range_old.first<work_range_old.second)
return work_range_old.first;
else
return -1;
}
// Claim an index from this thread's range from the end
inline long get_work_index_end () {
Kokkos::pair<long,long> work_range_new = m_work_range;
Kokkos::pair<long,long> work_range_old = work_range_new;
if(work_range_old.first>=work_range_old.second)
return -1;
work_range_new.second-=1;
bool success = false;
while(!success) {
work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
success = ( (work_range_new == work_range_old) ||
(work_range_new.first>=work_range_new.second) );
work_range_old = work_range_new;
work_range_new.second-=1;
}
if(work_range_old.first<work_range_old.second)
return work_range_old.second-1;
else
return -1;
}
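The two claim functions above are compare-and-swap races over the shared [first,second) chunk range. The same idea in standard C++ (a simplified sketch using std::atomic in place of Kokkos::atomic_compare_exchange; packing both ends into one word keeps each claim a single CAS):
#include <atomic>
#include <cstdint>
#include <cstdio>
// Pack the [first,second) chunk range into one 64-bit word so a single
// CAS can claim from either end without tearing.
static std::atomic< uint64_t > work_range( ( uint64_t(10) << 32 ) | 0 ); // [0,10)
inline uint64_t pack( uint32_t first , uint32_t second )
{ return ( uint64_t(second) << 32 ) | first ; }
// Claim one chunk index from the front; return -1 when exhausted.
long claim_front() {
  uint64_t old_val = work_range.load();
  for (;;) {
    const uint32_t first  = uint32_t( old_val );
    const uint32_t second = uint32_t( old_val >> 32 );
    if ( first >= second ) return -1 ;              // range exhausted
    if ( work_range.compare_exchange_weak( old_val , pack( first + 1 , second ) ) )
      return long( first );                         // claimed 'first'
    // CAS failed: old_val now holds the current value; retry.
  }
}
int main() {
  for ( long i = claim_front() ; -1 != i ; i = claim_front() )
    printf( "claimed chunk %ld\n" , i );
}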
// Reset the steal target
inline void reset_steal_target() {
m_current_steal_target = (m_pool_rank+1)%m_pool_topo[0];
m_stealing = false;
}
// Reset the steal target
inline void reset_steal_target(int team_size) {
m_current_steal_target = (m_pool_rank_rev+team_size);
if(m_current_steal_target>=m_pool_topo[0])
m_current_steal_target = 0;//m_pool_topo[0]-1;
m_stealing = false;
}
// Get a steal target: start with my rank + 1 and go round robin until arriving at this thread's rank
// Returns -1 if no active steal target is available
inline int get_steal_target() {
while(( m_pool[m_current_steal_target]->m_work_range.second <=
m_pool[m_current_steal_target]->m_work_range.first ) &&
(m_current_steal_target!=m_pool_rank) ) {
m_current_steal_target = (m_current_steal_target+1)%m_pool_topo[0];
}
if(m_current_steal_target == m_pool_rank)
return -1;
else
return m_current_steal_target;
}
inline int get_steal_target(int team_size) {
while(( m_pool[m_current_steal_target]->m_work_range.second <=
m_pool[m_current_steal_target]->m_work_range.first ) &&
(m_current_steal_target!=m_pool_rank_rev) ) {
if(m_current_steal_target + team_size < m_pool_topo[0])
m_current_steal_target = (m_current_steal_target+team_size);
else
m_current_steal_target = 0;
}
if(m_current_steal_target == m_pool_rank_rev)
return -1;
else
return m_current_steal_target;
}
inline long steal_work_index (int team_size = 0) {
long index = -1;
int steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
while ( (steal_target != -1) && (index == -1)) {
index = m_pool[steal_target]->get_work_index_end();
if(index == -1)
steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
}
return index;
}
// Get a work index. Claim from the owned range until it is exhausted, then steal from other threads
inline long get_work_index (int team_size = 0) {
long work_index = -1;
if(!m_stealing) work_index = get_work_index_begin();
if( work_index == -1) {
memory_fence();
m_stealing = true;
work_index = steal_work_index(team_size);
}
m_team_work_index = work_index;
memory_fence();
return work_index;
}
HostThreadTeamData * get_thread_data() noexcept
{ return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
inline static
HostThreadTeamData * get_thread_data( int i ) noexcept
{ return m_pool[i]; }
};
} // namespace Impl
@ -294,356 +130,6 @@ public:
namespace Kokkos {
namespace Impl {
class OpenMPexecTeamMember {
public:
enum { TEAM_REDUCE_SIZE = 512 };
/** \brief Thread states for team synchronization */
enum { Active = 0 , Rendezvous = 1 };
typedef Kokkos::OpenMP execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::OpenMPexec & m_exec ;
scratch_memory_space m_team_shared ;
int m_team_scratch_size[2] ;
int m_team_base_rev ;
int m_team_rank_rev ;
int m_team_rank ;
int m_team_size ;
int m_league_rank ;
int m_league_end ;
int m_league_size ;
int m_chunk_size;
int m_league_chunk_end;
Impl::OpenMPexec & m_team_lead_exec ;
int m_invalid_thread;
int m_team_alloc;
// Fan-in team threads, root of the fan-in which does not block returns true
inline
bool team_fan_in() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
}
if ( m_team_rank_rev ) {
m_exec.state_set( Rendezvous );
memory_fence();
m_exec.state_wait( Rendezvous );
}
return 0 == m_team_rank_rev ;
}
inline
void team_fan_out() const
{
memory_fence();
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
memory_fence();
}
}
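// Illustrative fan pattern for team_size = 8, indexed by rev_rank:
//   rev_rank 0 waits on 1, 2, 4; rev_rank 2 waits on 3; rev_rank 4 waits on
//   5 and 6; rev_rank 6 waits on 7; odd rev_ranks wait on nobody and go
//   straight to the rendezvous, so each barrier touches O(log2(team_size))
//   peers per thread.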
public:
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_shmem() const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space& thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{
if ( 1 < m_team_size && !m_invalid_thread) {
team_fan_in();
team_fan_out();
}
}
#endif
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType& value, const int& thread_id) const
{
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ }
#else
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
, ValueType , void >::type type ;
type volatile * const shared_value =
((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
if ( team_rank() == thread_id ) *shared_value = value;
memory_fence();
team_barrier(); // Wait for 'thread_id' to write
value = *shared_value ;
team_barrier(); // Wait for team members to read
#endif
}
template< class ValueType, class JoinOp >
KOKKOS_INLINE_FUNCTION ValueType
team_reduce( const ValueType & value
, const JoinOp & op_in ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ValueType(); }
#else
{
memory_fence();
typedef ValueType value_type;
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
#endif
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
, value_type , void >::type type ;
type * const local_value = ((type*) m_exec.scratch_thread());
// Set this thread's contribution
*local_value = value ;
// Fence to make sure the base team member has access:
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
type * const team_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
// Join to the team value:
for ( int i = 1 ; i < m_team_size ; ++i ) {
op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
}
memory_fence();
// The base team member may "lap" the other team members,
// copy to their local value before proceeding.
for ( int i = 1 ; i < m_team_size ; ++i ) {
*((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
}
// Fence to make sure all team members have access
memory_fence();
}
team_fan_out();
return *((type volatile const *)local_value);
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename ArgType >
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return ArgType(); }
#else
{
// Make sure there is enough scratch space:
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
volatile type * const work_value = ((type*) m_exec.scratch_thread());
*work_value = value ;
memory_fence();
if ( team_fan_in() ) {
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
// m_team_base[0] == highest ranking team member
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
//
// 1) copy from lower to higher rank, initialize lowest rank to zero
// 2) prefix sum from lowest to highest rank, skipping lowest rank
type accum = 0 ;
if ( global_accum ) {
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
accum += val ;
}
accum = atomic_fetch_add( global_accum , accum );
}
for ( int i = m_team_size ; i-- ; ) {
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
const type offset = accum ;
accum += val ;
val = offset ;
}
memory_fence();
}
team_fan_out();
return *work_value ;
}
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
{ return this-> template team_scan<Type>( value , 0 ); }
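// Worked example (illustrative): with team_size = 3 and per-rank values
// { rank0: 3, rank1: 1, rank2: 2 }, team_scan returns { 0, 3, 4 } and the
// highest rank recovers the reduction total as 4 + 2 = 6.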
//----------------------------------------
// Private for the driver
private:
typedef execution_space::scratch_memory_space space ;
public:
template< class ... Properties >
inline
OpenMPexecTeamMember( Impl::OpenMPexec & exec
, const TeamPolicyInternal< OpenMP, Properties ...> & team
, const int shmem_size_L1
, const int shmem_size_L2
)
: m_exec( exec )
, m_team_shared(0,0)
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
, m_team_base_rev(0)
, m_team_rank_rev(0)
, m_team_rank(0)
, m_team_size( team.team_size() )
, m_league_rank(0)
, m_league_end(0)
, m_league_size( team.league_size() )
, m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
, m_league_chunk_end(0)
, m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
, m_team_alloc( team.team_alloc())
{
const int pool_rank_rev = m_exec.pool_rank_rev();
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
const int pool_num_teams = OpenMP::thread_pool_size(0)/team.team_alloc();
const int chunks_per_team = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
int league_iter_begin = league_iter_end - chunks_per_team * m_chunk_size;
if (league_iter_begin < 0) league_iter_begin = 0;
if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
if ((team.team_alloc()>m_team_size)?
(pool_team_rank_rev >= m_team_size):
(m_exec.pool_size() - pool_num_teams*m_team_size > m_exec.pool_rank())
)
m_invalid_thread = 1;
else
m_invalid_thread = 0;
m_team_rank_rev = pool_team_rank_rev ;
if ( pool_team_rank_rev < m_team_size && !m_invalid_thread ) {
m_team_base_rev = team.team_alloc() * pool_league_rank_rev ;
m_team_rank_rev = pool_team_rank_rev ;
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
m_league_end = league_iter_end ;
m_league_rank = league_iter_begin ;
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0 );
}
if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
m_exec.set_work_range(m_league_rank,m_league_end,m_chunk_size);
m_exec.reset_steal_target(m_team_size);
}
}
bool valid_static() const
{
return m_league_rank < m_league_end ;
}
void next_static()
{
if ( m_league_rank < m_league_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
bool valid_dynamic() {
if(m_invalid_thread)
return false;
if ((m_league_rank < m_league_chunk_end) && (m_league_rank < m_league_size)) {
return true;
}
if ( m_team_rank_rev == 0 ) {
m_team_lead_exec.get_work_index(m_team_alloc);
}
team_barrier();
long work_index = m_team_lead_exec.team_work_index();
m_league_rank = work_index * m_chunk_size;
m_league_chunk_end = (work_index +1 ) * m_chunk_size;
if(m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size;
if(m_league_rank>=0)
return true;
return false;
}
void next_dynamic() {
if(m_invalid_thread)
return;
if ( m_league_rank < m_league_chunk_end ) {
team_barrier();
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
0);
}
m_league_rank++;
}
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
};
template< class ... Properties >
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
{
@ -671,8 +157,11 @@ public:
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return traits::execution_space::thread_pool_size(1); }
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
inline static
@ -702,7 +191,8 @@ private:
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int team_max = traits::execution_space::thread_pool_size(1);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
@ -823,7 +313,7 @@ private:
}
public:
typedef Impl::OpenMPexecTeamMember member_type ;
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
};
} // namespace Impl
@ -850,216 +340,6 @@ int OpenMP::thread_pool_rank()
#endif
}
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >
TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType& count ) {
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::OpenMPexecTeamMember >
TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType1& begin, const iType2& end ) {
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
}
} // namespace Kokkos
namespace Kokkos {
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Inter-thread parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
}
/** \brief Inter-thread parallel_reduce with a user-supplied join. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. 0 for addition
* or 1 for multiplication). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = loop_boundaries.thread.team_reduce(result,join);
}
} //namespace Kokkos
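// Illustrative only -- a minimal usage sketch of the team-level constructs
// above; the view 'A' and extents 'nleague'/'nwork' are hypothetical.
#if 0
Kokkos::View<double**> A( "A" , nleague , nwork );
double total = 0 ;
Kokkos::parallel_reduce( Kokkos::TeamPolicy<>( nleague , Kokkos::AUTO ) ,
  KOKKOS_LAMBDA( const Kokkos::TeamPolicy<>::member_type & team , double & update ) {
    const int i = team.league_rank();
    double row_sum = 0 ;
    // Threads of the team cooperatively sum row i.
    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team , nwork ) ,
      [&]( const int j , double & val ) { val += A( i , j ); } , row_sum );
    // Exactly one thread per team contributes the row result.
    Kokkos::single( Kokkos::PerTeam( team ) , [&]() { update += row_sum ; } );
  } , total );
#endif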
namespace Kokkos {
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a summation of
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
}
}
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes of the calling thread and a reduction of
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
* The input value of init_result is used as the initializer for temporary variables of ValueType. Therefore
* the input value should be the neutral element with respect to the join operation (e.g. 0 for addition
* or 1 for multiplication). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
init_result = result;
}
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
* for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
* Depending on the target execution space the operator might be called twice: once with final=false
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
* to the final sum value over all vector lanes.
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename ValueTraits::value_type value_type ;
value_type scan_val = value_type();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,scan_val,true);
}
}
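// Illustrative only -- a caller-side sketch of the final-flag protocol for
// the scan above; 'out', 'v', and 'n' are hypothetical.
#if 0
Kokkos::parallel_scan( Kokkos::ThreadVectorRange( team , n ) ,
  [&]( const int i , double & partial , const bool final ) {
    if ( final ) out( i ) = partial ; // Exclusive prefix sum at position i.
    partial += v( i ) ;               // Contribute whether final or not.
  } );
#endif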
} // namespace Kokkos
namespace Kokkos {
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
single_struct.team_member.team_broadcast(val,0);
}
}
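// Illustrative only -- computing a value once per team and broadcasting it,
// using the overloads above; 'expensive_setup' is hypothetical.
#if 0
double scale = 0 ;
Kokkos::single( Kokkos::PerTeam( team ) , [&]( double & val ) {
  val = expensive_setup(); // Runs on team rank 0 only...
} , scale );               // ...then team_broadcast makes 'scale' visible
                           // to every thread of the team.
#endif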
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
@ -1,511 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_QTHREAD )
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <utility>
#include <Kokkos_Qthread.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
// Defines to enable experimental Qthread functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
#include <qthread/qthread.h>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
/** s_exec is indexed by the reverse rank of the workers
* for faster fan-in / fan-out lookups
* [ n - 1 , n - 2 , ... , 0 ]
*/
QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
int s_number_shepherds = 0 ;
int s_number_workers_per_shepherd = 0 ;
int s_number_workers = 0 ;
inline
QthreadExec ** worker_exec()
{
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
}
const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
int s_worker_reduce_end = 0 ; /* End of worker reduction memory */
int s_worker_shared_end = 0 ; /* Total of worker scratch memory */
int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
QthreadExecFunctionPointer volatile s_active_function = 0 ;
const void * volatile s_active_function_arg = 0 ;
} /* namespace */
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
int Qthread::is_initialized()
{
return Impl::s_number_workers != 0 ;
}
int Qthread::concurrency()
{
return Impl::s_number_workers_per_shepherd ;
}
int Qthread::in_parallel()
{
return Impl::s_active_function != 0 ;
}
void Qthread::initialize( int thread_count )
{
// Environment variable: QTHREAD_NUM_SHEPHERDS
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
// Environment variable: QTHREAD_HWPAR
{
char buffer[256];
snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
putenv(buffer);
}
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
( thread_count == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
( thread_count == qthread_num_workers() );
bool ok_symmetry = true ;
if ( ok_init ) {
Impl::s_number_shepherds = qthread_num_shepherds();
Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
}
}
if ( ! ok_init || ! ok_symmetry ) {
std::ostringstream msg ;
msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
msg << " : qthread_num_workers = " << qthread_num_workers();
if ( ! ok_symmetry ) {
msg << " : qthread_num_workers_local = {" ;
for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
msg << " " << qthread_num_workers_local(i) ;
}
msg << " }" ;
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
if ( ok_init ) { qthread_finalize(); }
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
// Initialize the lock array used for arbitrarily-sized atomics
Impl::init_lock_array_host_space();
}
void Qthread::finalize()
{
Impl::QthreadExec::clear_workers();
if ( Impl::s_number_workers ) {
qthread_finalize();
}
Impl::s_number_workers = 0 ;
Impl::s_number_shepherds = 0 ;
Impl::s_number_workers_per_shepherd = 0 ;
}
void Qthread::print_configuration( std::ostream & s , const bool detail )
{
s << "Kokkos::Qthread {"
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
<< " }" << std::endl ;
}
Qthread & Qthread::instance( int )
{
static Qthread q ;
return q ;
}
void Qthread::fence()
{
}
int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
aligned_t driver_exec_all( void * arg )
{
QthreadExec & exec = **worker_exec();
(*s_active_function)( exec , s_active_function_arg );
/*
fprintf( stdout
, "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, exec.worker_rank()
, exec.worker_size()
, exec.shepherd_rank()
, exec.shepherd_size()
, exec.shepherd_worker_rank()
, exec.shepherd_worker_size()
);
fflush(stdout);
*/
return 0 ;
}
aligned_t driver_resize_worker_scratch( void * arg )
{
static volatile int lock_begin = 0 ;
static volatile int lock_end = 0 ;
QthreadExec ** const exec = worker_exec();
//----------------------------------------
// Serialize allocation for thread safety
while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
const bool ok = 0 == *exec ;
if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
lock_begin = 0 ; // release lock
if ( ok ) { new( *exec ) QthreadExec(); }
//----------------------------------------
// Wait for all calls to complete to ensure that each worker has executed.
if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
while ( lock_end );
/*
fprintf( stdout
, "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, (**exec).worker_rank()
, (**exec).worker_size()
, (**exec).shepherd_rank()
, (**exec).shepherd_size()
, (**exec).shepherd_worker_rank()
, (**exec).shepherd_worker_size()
);
fflush(stdout);
*/
//----------------------------------------
if ( ! ok ) {
fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
fflush( stderr );
}
return 0 ;
}
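// Illustrative only -- the serialize-then-rendezvous idiom above reduced to
// a standalone std::atomic sketch; all names are hypothetical.
#if 0
#include <atomic>

std::atomic<int> lock_begin( 0 ) , lock_end( 0 ) ;

void serialized_section( const int num_workers )
{
  int expected = 0 ;
  // Spin until this worker claims the lock; one worker at a time proceeds.
  while ( ! lock_begin.compare_exchange_strong( expected , 1 ) ) expected = 0 ;
  /* ... serialized work, e.g. the per-worker allocation above ... */
  lock_begin.store( 0 ); // Release the lock.

  // Counter barrier: the last arrival resets the counter, everyone else
  // spins until that reset is observed.
  if ( num_workers == 1 + lock_end.fetch_add( 1 ) ) lock_end.store( 0 );
  while ( lock_end.load() != 0 ) { /* spin */ }
}
#endif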
void verify_is_process( const char * const label , bool not_active = false )
{
const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
const bool is_active = not_active && ( s_active_function || s_active_function_arg );
if ( not_process || is_active ) {
std::string msg( label );
msg.append( " : FAILED" );
if ( not_process ) msg.append(" : not called by main process");
if ( is_active ) msg.append(" : parallel execution in progress");
Kokkos::Impl::throw_runtime_exception( msg );
}
}
}
int QthreadExec::worker_per_shepherd()
{
return s_number_workers_per_shepherd ;
}
QthreadExec::QthreadExec()
{
const int shepherd_rank = qthread_shep();
const int shepherd_worker_rank = qthread_worker_local(NULL);
const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
m_worker_base = s_exec ;
m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
m_scratch_alloc = ( (unsigned char *) this ) + s_base_size ;
m_reduce_end = s_worker_reduce_end ;
m_shepherd_rank = shepherd_rank ;
m_shepherd_size = s_number_shepherds ;
m_shepherd_worker_rank = shepherd_worker_rank ;
m_shepherd_worker_size = s_number_workers_per_shepherd ;
m_worker_rank = worker_rank ;
m_worker_size = s_number_workers ;
m_worker_state = QthreadExec::Active ;
}
void QthreadExec::clear_workers()
{
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
QthreadExec * const exec = s_exec[iwork] ;
s_exec[iwork] = 0 ;
free( exec );
}
}
void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
{
new( & space )
Qthread::scratch_memory_space(
((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
s_worker_shared_end - s_worker_shared_begin
);
}
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
const int exec_all_reduce_alloc = align_alloc( reduce_size );
const int shepherd_scan_alloc = align_alloc( 8 );
const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
if ( s_worker_reduce_end < exec_all_reduce_alloc ||
s_worker_shared_end < shepherd_shared_end ) {
/*
fprintf( stdout , "QthreadExec::resize\n");
fflush(stdout);
*/
// Clear current worker memory before allocating new worker memory
clear_workers();
// Increase the buffers to an aligned allocation
s_worker_reduce_end = exec_all_reduce_alloc ;
s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
s_worker_shared_end = shepherd_shared_end ;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
// Have each worker resize its memory for proper first-touch
#if 0
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test runs,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_resize_worker_scratch /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_resize_worker_scratch( NULL );
// Verify all workers allocated
bool ok = true ;
for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
if ( ! ok ) {
std::ostringstream msg ;
msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
}
msg << " }" ;
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
}
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
verify_is_process("QthreadExec::exec_all(...)",true);
/*
fprintf( stdout , "QthreadExec::exec_all\n");
fflush(stdout);
*/
s_active_function = func ;
s_active_function_arg = arg ;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
#if 0
for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
}}
#else
// If this function is used before the 'qthread.task_policy' unit test runs,
// that unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_exec_all /* function */
, NULL /* function data block */
, NULL /* pointer to return value feb */
, jshep /* shepherd number */
, num_clone - 1 /* number of instances - 1 */
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_exec_all( NULL );
s_active_function = 0 ;
s_active_function_arg = 0 ;
}
void * QthreadExec::exec_all_reduce_result()
{
return s_exec[0]->m_scratch_alloc ;
}
} /* namespace Impl */
} /* namespace Kokkos */
namespace Kokkos {
namespace Impl {
QthreadTeamPolicyMember::QthreadTeamPolicyMember()
: m_exec( **worker_exec() )
, m_team_shared(0,0)
, m_team_size( 1 )
, m_team_rank( 0 )
, m_league_size(1)
, m_league_end(1)
, m_league_rank(0)
{
m_exec.shared_reset( m_team_shared );
}
QthreadTeamPolicyMember::QthreadTeamPolicyMember( const QthreadTeamPolicyMember::TaskTeam & )
: m_exec( **worker_exec() )
, m_team_shared(0,0)
, m_team_size( s_number_workers_per_shepherd )
, m_team_rank( m_exec.shepherd_worker_rank() )
, m_league_size(1)
, m_league_end(1)
, m_league_rank(0)
{
m_exec.shared_reset( m_team_shared );
}
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
@ -1,620 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREADEXEC_HPP
#define KOKKOS_QTHREADEXEC_HPP
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
class QthreadExec ;
typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
class QthreadExec {
private:
enum { Inactive = 0 , Active = 1 };
const QthreadExec * const * m_worker_base ;
const QthreadExec * const * m_shepherd_base ;
void * m_scratch_alloc ; ///< Scratch memory [ reduce , team , shared ]
int m_reduce_end ; ///< End of scratch reduction memory
int m_shepherd_rank ;
int m_shepherd_size ;
int m_shepherd_worker_rank ;
int m_shepherd_worker_size ;
/*
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
*/
int m_worker_rank ;
int m_worker_size ;
int mutable volatile m_worker_state ;
friend class Kokkos::Qthread ;
~QthreadExec();
QthreadExec( const QthreadExec & );
QthreadExec & operator = ( const QthreadExec & );
public:
QthreadExec();
/** Execute the input function on all available Qthread workers */
static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
//----------------------------------------
/** Barrier across all workers participating in the 'exec_all' */
void exec_all_barrier() const
{
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
/** Barrier across workers within the shepherd with rank < team_rank */
void shepherd_barrier( const int team_size ) const
{
if ( m_shepherd_worker_rank < team_size ) {
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
}
}
//----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ReducerType , class ArgTag >
inline
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
const QthreadExec & fan = *m_worker_base[j];
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
//----------------------------------------
/** Scan across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag >
inline
void exec_all_scan( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , ArgTag > ValueOps ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
for ( int i = 1 ; i < m_worker_size ; ++i ) {
ValueOps::copy( func
, m_worker_base[i-1]->m_scratch_alloc
, m_worker_base[i]->m_scratch_alloc
);
}
ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
// Join from lower ranking to higher ranking worker.
// Value at m_worker_base[m_worker_size-1] is zero, so skip adding it to m_worker_base[m_worker_size-2].
for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
}
}
//----------------------------------------
template< class Type>
inline
volatile Type * shepherd_team_scratch_value() const
{ return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
template< class Type >
inline
void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
{
if ( m_shepherd_base ) {
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
memory_fence();
shepherd_barrier( team_size );
value = *shared_value ;
}
}
template< class Type >
inline
Type shepherd_reduce( const int team_size , const Type & value ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class JoinOp >
inline
typename JoinOp::value_type
shepherd_reduce( const int team_size
, const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef typename JoinOp::value_type Type ;
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
}
for ( int i = 1 ; i < team_size ; ++i ) {
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
template< class Type >
inline
Type shepherd_scan( const int team_size
, const Type & value
, Type * const global_value = 0 ) const
{
*shepherd_team_scratch_value<Type>() = value ;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n , j ;
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadExec::Inactive ;
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
}
else {
// Root thread scans across values before releasing threads
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1 ; i < team_size ; ++i ) {
const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
accum += tmp ;
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
}
* m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
// Join from lower ranking to higher ranking worker.
for ( int i = team_size ; --i ; ) {
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
memory_fence();
}
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
}
return *shepherd_team_scratch_value<Type>();
}
//----------------------------------------
static inline
int align_alloc( int size )
{
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64 bytes */ };
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
}
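// Worked example (illustrative): align_alloc(100) = (100 + 63) & ~63 = 128,
// while align_alloc(64) stays 64 -- sizes round up to the 64-byte grain.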
void shared_reset( Qthread::scratch_memory_space & );
void * exec_all_reduce_value() const { return m_scratch_alloc ; }
static void * exec_all_reduce_result();
static void resize_worker_scratch( const int reduce_size , const int shared_size );
static void clear_workers();
//----------------------------------------
inline int worker_rank() const { return m_worker_rank ; }
inline int worker_size() const { return m_worker_size ; }
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
inline int shepherd_rank() const { return m_shepherd_rank ; }
inline int shepherd_size() const { return m_shepherd_size ; }
static int worker_per_shepherd();
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadTeamPolicyMember {
private:
typedef Kokkos::Qthread execution_space ;
typedef execution_space::scratch_memory_space scratch_memory_space ;
Impl::QthreadExec & m_exec ;
scratch_memory_space m_team_shared ;
const int m_team_size ;
const int m_team_rank ;
const int m_league_size ;
const int m_league_end ;
int m_league_rank ;
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_team_shared ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{ m_exec.shepherd_barrier( m_team_size ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
#endif
template< typename JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{ return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value ;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the
* league's parallel execution, be the scan's total.
* Parallel execution ordering of the league's teams is non-deterministic.
* As such the base value for each team's scan operation is similarly
* non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
#endif
//----------------------------------------
// Private driver for task-team parallel
struct TaskTeam {};
QthreadTeamPolicyMember();
explicit QthreadTeamPolicyMember( const TaskTeam & );
//----------------------------------------
// Private for the driver: ( for ( member_type i(exec,team); i ; i.next_team() ) { ... } )
// Initialize
template< class ... Properties >
QthreadTeamPolicyMember( Impl::QthreadExec & exec
, const Kokkos::Impl::TeamPolicyInternal<Qthread,Properties...> & team )
: m_exec( exec )
, m_team_shared(0,0)
, m_team_size( team.m_team_size )
, m_team_rank( exec.shepherd_worker_rank() )
, m_league_size( team.m_league_size )
, m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
, m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
{
m_exec.shared_reset( m_team_shared );
}
// Continue
operator bool () const { return m_league_rank < m_league_end ; }
// iterate
void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
};
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Qthread , Properties ... >
: public PolicyTraits< Properties... >
{
private:
const int m_league_size ;
const int m_team_size ;
const int m_shepherd_iter ;
public:
//! Tag this class as a kokkos execution policy
typedef TeamPolicyInternal execution_policy ;
typedef Qthread execution_space ;
typedef PolicyTraits< Properties ... > traits ;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return Qthread::instance().shepherd_worker_size(); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & f )
{ return team_size_max( f ); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & f , const int& )
{ return team_size_max( f ); }
//----------------------------------------
inline int team_size() const { return m_team_size ; }
inline int league_size() const { return m_league_size ; }
// One active team per shepherd
TeamPolicyInternal( Kokkos::Qthread & q
, const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < q.shepherd_worker_size()
? team_size : q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{
}
// One active team per shepherd
TeamPolicyInternal( const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < Qthread::instance().shepherd_worker_size()
? team_size : Qthread::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
{
}
typedef Impl::QthreadTeamPolicyMember member_type ;
friend class Impl::QthreadTeamPolicyMember ;
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #ifndef KOKKOS_QTHREADEXEC_HPP */
@ -0,0 +1,519 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#include <utility>
#include <Kokkos_Qthreads.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Error.hpp>
// Defines to enable experimental Qthreads functionality.
//#define QTHREAD_LOCAL_PRIORITY
//#define CLONED_TASKS
//#include <qthread.h>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
enum { MAXIMUM_QTHREADS_WORKERS = 1024 };
/** s_exec is indexed by the reverse rank of the workers
* for faster fan-in / fan-out lookups
* [ n - 1, n - 2, ..., 0 ]
*/
QthreadsExec * s_exec[ MAXIMUM_QTHREADS_WORKERS ];
int s_number_shepherds = 0;
int s_number_workers_per_shepherd = 0;
int s_number_workers = 0;
inline
QthreadsExec ** worker_exec()
{
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local( NULL ) + 1 );
}
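// Worked example (not part of the original source): with 2 shepherds of
// 2 workers each, s_number_workers == 4 and worker_exec() maps
//   ( shepherd 0, local 0 ) -> s_exec + 3
//   ( shepherd 0, local 1 ) -> s_exec + 2
//   ( shepherd 1, local 0 ) -> s_exec + 1
//   ( shepherd 1, local 1 ) -> s_exec + 0
// i.e. offset = s_number_workers - ( shepherd * workers_per_shepherd + local + 1 ).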
const int s_base_size = QthreadsExec::align_alloc( sizeof(QthreadsExec) );
int s_worker_reduce_end = 0; // End of worker reduction memory.
int s_worker_shared_end = 0; // Total of worker scratch memory.
int s_worker_shared_begin = 0; // Beginning of worker shared memory.
QthreadsExecFunctionPointer volatile s_active_function = 0;
const void * volatile s_active_function_arg = 0;
} // namespace
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
int Qthreads::is_initialized()
{
return Impl::s_number_workers != 0;
}
int Qthreads::concurrency()
{
return Impl::s_number_workers_per_shepherd;
}
int Qthreads::in_parallel()
{
return Impl::s_active_function != 0;
}
void Qthreads::initialize( int thread_count )
{
// Environment variable: QTHREAD_NUM_SHEPHERDS
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
// Environment variable: QTHREAD_HWPAR
{
static char buffer[256]; // Static storage: putenv() keeps a pointer into its argument.
snprintf( buffer, sizeof(buffer), "QTHREAD_HWPAR=%d", thread_count );
putenv( buffer );
}
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
( thread_count == qthread_num_shepherds() * qthread_num_workers_local( NO_SHEPHERD ) ) &&
( thread_count == qthread_num_workers() );
bool ok_symmetry = true;
if ( ok_init ) {
Impl::s_number_shepherds = qthread_num_shepherds();
Impl::s_number_workers_per_shepherd = qthread_num_workers_local( NO_SHEPHERD );
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd;
for ( int i = 0; ok_symmetry && i < Impl::s_number_shepherds; ++i ) {
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local( i ) );
}
}
if ( ! ok_init || ! ok_symmetry ) {
std::ostringstream msg;
msg << "Kokkos::Qthreads::initialize(" << thread_count << ") FAILED";
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local( NO_SHEPHERD );
msg << " : qthread_num_workers = " << qthread_num_workers();
if ( ! ok_symmetry ) {
msg << " : qthread_num_workers_local = {";
for ( int i = 0; i < Impl::s_number_shepherds; ++i ) {
msg << " " << qthread_num_workers_local( i );
}
msg << " }";
}
Impl::s_number_workers = 0;
Impl::s_number_shepherds = 0;
Impl::s_number_workers_per_shepherd = 0;
if ( ok_init ) { qthread_finalize(); }
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
Impl::QthreadsExec::resize_worker_scratch( 256, 256 );
// Initialize the lock array used for arbitrarily sized atomics.
Impl::init_lock_array_host_space();
}
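// Minimal usage sketch (illustrative, not part of the original source),
// assuming QTHREAD_NUM_SHEPHERDS * QTHREAD_NUM_WORKERS_PER_SHEP == 16:
//
//   Kokkos::Qthreads::initialize( 16 );  // exports QTHREAD_HWPAR=16
//   // ... dispatch work to the Kokkos::Qthreads execution space ...
//   Kokkos::Qthreads::finalize();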
void Qthreads::finalize()
{
Impl::QthreadsExec::clear_workers();
if ( Impl::s_number_workers ) {
qthread_finalize();
}
Impl::s_number_workers = 0;
Impl::s_number_shepherds = 0;
Impl::s_number_workers_per_shepherd = 0;
}
void Qthreads::print_configuration( std::ostream & s, const bool detail )
{
s << "Kokkos::Qthreads {"
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
<< " }" << std::endl;
}
Qthreads & Qthreads::instance( int )
{
static Qthreads q;
return q;
}
void Qthreads::fence()
{
}
int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
aligned_t driver_exec_all( void * arg )
{
QthreadsExec & exec = **worker_exec();
(*s_active_function)( exec, s_active_function_arg );
/*
fprintf( stdout
, "QthreadsExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, exec.worker_rank()
, exec.worker_size()
, exec.shepherd_rank()
, exec.shepherd_size()
, exec.shepherd_worker_rank()
, exec.shepherd_worker_size()
);
fflush(stdout);
*/
return 0;
}
aligned_t driver_resize_worker_scratch( void * arg )
{
static volatile int lock_begin = 0;
static volatile int lock_end = 0;
QthreadsExec ** const exec = worker_exec();
//----------------------------------------
// Serialize allocation for thread safety.
while ( ! atomic_compare_exchange_strong( & lock_begin, 0, 1 ) ); // Spin wait to claim lock.
const bool ok = 0 == *exec;
if ( ok ) { *exec = (QthreadsExec *) malloc( s_base_size + s_worker_shared_end ); }
lock_begin = 0; // Release lock.
if ( ok ) { new( *exec ) QthreadsExec(); }
//----------------------------------------
// Wait for all calls to complete to ensure that each worker has executed.
if ( s_number_workers == 1 + atomic_fetch_add( & lock_end, 1 ) ) { lock_end = 0; }
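// Counting barrier (explanatory note, not in the original source): each worker
// increments lock_end; the last to arrive resets it to zero, which releases
// every worker spinning in the while loop below.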
while ( lock_end );
/*
fprintf( stdout
, "QthreadsExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
, (**exec).worker_rank()
, (**exec).worker_size()
, (**exec).shepherd_rank()
, (**exec).shepherd_size()
, (**exec).shepherd_worker_rank()
, (**exec).shepherd_worker_size()
);
fflush(stdout);
*/
//----------------------------------------
if ( ! ok ) {
fprintf( stderr, "Kokkos::QthreadsExec resize failed\n" );
fflush( stderr );
}
return 0;
}
void verify_is_process( const char * const label, bool not_active = false )
{
const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local( NULL );
const bool is_active = not_active && ( s_active_function || s_active_function_arg );
if ( not_process || is_active ) {
std::string msg( label );
msg.append( " : FAILED" );
if ( not_process ) msg.append(" : not called by main process");
if ( is_active ) msg.append(" : parallel execution in progress");
Kokkos::Impl::throw_runtime_exception( msg );
}
}
} // namespace
int QthreadsExec::worker_per_shepherd()
{
return s_number_workers_per_shepherd;
}
QthreadsExec::QthreadsExec()
{
const int shepherd_rank = qthread_shep();
const int shepherd_worker_rank = qthread_worker_local( NULL );
const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank;
m_worker_base = s_exec;
m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
m_scratch_alloc = ( (unsigned char *) this ) + s_base_size;
m_reduce_end = s_worker_reduce_end;
m_shepherd_rank = shepherd_rank;
m_shepherd_size = s_number_shepherds;
m_shepherd_worker_rank = shepherd_worker_rank;
m_shepherd_worker_size = s_number_workers_per_shepherd;
m_worker_rank = worker_rank;
m_worker_size = s_number_workers;
m_worker_state = QthreadsExec::Active;
}
void QthreadsExec::clear_workers()
{
for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
QthreadsExec * const exec = s_exec[iwork];
s_exec[iwork] = 0;
free( exec );
}
}
void QthreadsExec::shared_reset( Qthreads::scratch_memory_space & space )
{
new( & space )
Qthreads::scratch_memory_space(
((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin,
s_worker_shared_end - s_worker_shared_begin
);
}
void QthreadsExec::resize_worker_scratch( const int reduce_size, const int shared_size )
{
const int exec_all_reduce_alloc = align_alloc( reduce_size );
const int shepherd_scan_alloc = align_alloc( 8 );
const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
if ( s_worker_reduce_end < exec_all_reduce_alloc ||
s_worker_shared_end < shepherd_shared_end ) {
/*
fprintf( stdout, "QthreadsExec::resize\n");
fflush(stdout);
*/
// Clear current worker memory before allocating new worker memory.
clear_workers();
// Increase the buffers to an aligned allocation.
s_worker_reduce_end = exec_all_reduce_alloc;
s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc;
s_worker_shared_end = shepherd_shared_end;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
// Have each worker resize its memory for proper first-touch.
#if 0
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i ) {
qthread_fork_to( driver_resize_worker_scratch, NULL, NULL, jshep );
}
}
#else
// If this function is used before the 'qthreads.task_policy' unit test,
// the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_resize_worker_scratch // Function
, NULL // Function data block
, NULL // Pointer to return value feb
, jshep // Shepherd number
, num_clone - 1 // Number of instances - 1
);
assert( ret == QTHREAD_SUCCESS );
}
}
#endif
driver_resize_worker_scratch( NULL );
// Verify all workers allocated.
bool ok = true;
for ( int iwork = 0; ok && iwork < s_number_workers; ++iwork ) { ok = 0 != s_exec[iwork]; }
if ( ! ok ) {
std::ostringstream msg;
msg << "Kokkos::Impl::QthreadsExec::resize : FAILED for workers {";
for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
}
msg << " }";
Kokkos::Impl::throw_runtime_exception( msg.str() );
}
}
}
void QthreadsExec::exec_all( Qthreads &, QthreadsExecFunctionPointer func, const void * arg )
{
verify_is_process("QthreadsExec::exec_all(...)",true);
/*
fprintf( stdout, "QthreadsExec::exec_all\n");
fflush(stdout);
*/
s_active_function = func;
s_active_function_arg = arg;
// Need to query which shepherd this main 'process' is running on.
const int main_shep = qthread_shep();
#if 0
for ( int jshep = 0, iwork = 0; jshep < s_number_shepherds; ++jshep ) {
for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i, ++iwork ) {
qthread_fork_to( driver_exec_all, NULL, NULL, jshep );
}
}
#else
// If this function is used before the 'qthreads.task_policy' unit test,
// the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
if ( num_clone ) {
const int ret = qthread_fork_clones_to_local_priority
( driver_exec_all // Function
, NULL // Function data block
, NULL // Pointer to return value feb
, jshep // Shepherd number
, num_clone - 1 // Number of instances - 1
);
assert(ret == QTHREAD_SUCCESS);
}
}
#endif
driver_exec_all( NULL );
s_active_function = 0;
s_active_function_arg = 0;
}
void * QthreadsExec::exec_all_reduce_result()
{
return s_exec[0]->m_scratch_alloc;
}
} // namespace Impl
} // namespace Kokkos
namespace Kokkos {
namespace Impl {
QthreadsTeamPolicyMember::QthreadsTeamPolicyMember()
: m_exec( **worker_exec() )
, m_team_shared( 0, 0 )
, m_team_size( 1 )
, m_team_rank( 0 )
, m_league_size( 1 )
, m_league_end( 1 )
, m_league_rank( 0 )
{
m_exec.shared_reset( m_team_shared );
}
QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMember::TaskTeam & )
: m_exec( **worker_exec() )
, m_team_shared( 0, 0 )
, m_team_size( s_number_workers_per_shepherd )
, m_team_rank( m_exec.shepherd_worker_rank() )
, m_league_size( 1 )
, m_league_end( 1 )
, m_league_rank( 0 )
{
m_exec.shared_reset( m_team_shared );
}
} // namespace Impl
} // namespace Kokkos
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )

View File

@ -0,0 +1,640 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_QTHREADSEXEC_HPP
#define KOKKOS_QTHREADSEXEC_HPP
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadsExec;
typedef void (*QthreadsExecFunctionPointer)( QthreadsExec &, const void * );
class QthreadsExec {
private:
enum { Inactive = 0, Active = 1 };
const QthreadsExec * const * m_worker_base;
const QthreadsExec * const * m_shepherd_base;
void * m_scratch_alloc; ///< Scratch memory [ reduce, team, shared ]
int m_reduce_end; ///< End of scratch reduction memory
int m_shepherd_rank;
int m_shepherd_size;
int m_shepherd_worker_rank;
int m_shepherd_worker_size;
/*
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
*/
int m_worker_rank;
int m_worker_size;
int mutable volatile m_worker_state;
friend class Kokkos::Qthreads;
~QthreadsExec();
QthreadsExec( const QthreadsExec & );
QthreadsExec & operator = ( const QthreadsExec & );
public:
QthreadsExec();
/** Execute the input function on all available Qthreads workers. */
static void exec_all( Qthreads &, QthreadsExecFunctionPointer, const void * );
/** Barrier across all workers participating in the 'exec_all'. */
void exec_all_barrier() const
{
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
}
}
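/* Illustrative trace (not part of the original source): with 4 workers,
 * rev_rank 1 and 3 have bit 0 set and immediately mark themselves Inactive;
 * rev_rank 2 first waits for rev_rank 3; rev_rank 0 (the root) waits for
 * rev_rank 1, then rev_rank 2, and finally re-activates its partners in
 * reverse order -- a binary fan-in / fan-out tree.
 */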
/** Barrier across workers within the shepherd with rank < team_rank. */
void shepherd_barrier( const int team_size ) const
{
if ( m_shepherd_worker_rank < team_size ) {
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
}
}
/** Reduce across all workers participating in the 'exec_all'. */
template< class FunctorType, class ReducerType, class ArgTag >
inline
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
const QthreadsExec & fan = *m_worker_base[j];
Impl::spinwait_while_equal( fan.m_worker_state, QthreadsExec::Active );
ValueJoin::join( ReducerConditional::select( func, reduce ), m_scratch_alloc, fan.m_scratch_alloc );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
}
}
/** Scan across all workers participating in the 'exec_all'. */
template< class FunctorType, class ArgTag >
inline
void exec_all_scan( const FunctorType & func ) const
{
typedef Kokkos::Impl::FunctorValueInit< FunctorType, ArgTag > ValueInit;
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, ArgTag > ValueJoin;
typedef Kokkos::Impl::FunctorValueOps< FunctorType, ArgTag > ValueOps;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
// Root thread scans across values before releasing threads.
// Worker data is in reverse order, so m_worker_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
for ( int i = 1; i < m_worker_size; ++i ) {
ValueOps::copy( func
, m_worker_base[i-1]->m_scratch_alloc
, m_worker_base[i]->m_scratch_alloc
);
}
ValueInit::init( func, m_worker_base[m_worker_size-1]->m_scratch_alloc );
// Join from lower ranking to higher ranking worker.
// Value at m_worker_base[m_worker_size-1] is zero, so skip adding it to m_worker_base[m_worker_size-2].
for ( int i = m_worker_size - 1; --i > 0; ) {
ValueJoin::join( func, m_worker_base[i-1]->m_scratch_alloc, m_worker_base[i]->m_scratch_alloc );
}
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
}
}
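/* Illustrative trace of the root-thread scan above (not part of the original
 * source): with 3 workers holding values v0 < v1 < v2 by rank (stored in
 * reverse, so m_worker_base[0] holds v2), the copy step yields [ v1, v0, - ],
 * init zeroes the last slot to [ v1, v0, 0 ], and the join step produces
 * [ v0+v1, v0, 0 ] -- an exclusive prefix sum by rank.
 */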
//----------------------------------------
template< class Type >
inline
volatile Type * shepherd_team_scratch_value() const
{ return (volatile Type*)( ( (unsigned char *) m_scratch_alloc ) + m_reduce_end ); }
template< class Type >
inline
void shepherd_broadcast( Type & value, const int team_size, const int team_rank ) const
{
if ( m_shepherd_base ) {
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value; }
memory_fence();
shepherd_barrier( team_size );
value = *shared_value;
}
}
template< class Type >
inline
Type shepherd_reduce( const int team_size, const Type & value ) const
{
volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
*shared_value = value;
// *shepherd_team_scratch_value<Type>() = value;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1; i < n; ++i ) {
accum += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
for ( int i = 1; i < n; ++i ) {
*m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
}
memory_fence();
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
return *shepherd_team_scratch_value<Type>();
}
template< class JoinOp >
inline
typename JoinOp::value_type
shepherd_reduce( const int team_size
, const typename JoinOp::value_type & value
, const JoinOp & op ) const
{
typedef typename JoinOp::value_type Type;
volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
*shared_value = value;
// *shepherd_team_scratch_value<Type>() = value;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
volatile Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1; i < team_size; ++i ) {
op.join( accum, *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
}
for ( int i = 1; i < team_size; ++i ) {
*m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
}
memory_fence();
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
return *shepherd_team_scratch_value<Type>();
}
template< class Type >
inline
Type shepherd_scan( const int team_size
, const Type & value
, Type * const global_value = 0 ) const
{
*shepherd_team_scratch_value<Type>() = value;
memory_fence();
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
int n, j;
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
}
if ( rev_rank ) {
m_worker_state = QthreadsExec::Inactive;
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
}
else {
// Root thread scans across values before releasing threads.
// Worker data is in reverse order, so m_shepherd_base[0] is the
// highest ranking thread.
// Copy from lower ranking to higher ranking worker.
Type accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
for ( int i = 1; i < team_size; ++i ) {
const Type tmp = *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
accum += tmp;
*m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp;
}
*m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
global_value ? atomic_fetch_add( global_value, accum ) : 0;
// Join from lower ranking to higher ranking worker.
for ( int i = team_size; --i; ) {
*m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
}
memory_fence();
}
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
}
return *shepherd_team_scratch_value<Type>();
}
//----------------------------------------
static inline
int align_alloc( int size )
{
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64 bytes */ };
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK;
}
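// Examples (illustrative, not part of the original source):
//   align_alloc( 100 ) == 128 and align_alloc( 64 ) == 64,
// i.e. sizes are rounded up to the next multiple of the 64-byte grain.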
void shared_reset( Qthreads::scratch_memory_space & );
void * exec_all_reduce_value() const { return m_scratch_alloc; }
static void * exec_all_reduce_result();
static void resize_worker_scratch( const int reduce_size, const int shared_size );
static void clear_workers();
//----------------------------------------
inline int worker_rank() const { return m_worker_rank; }
inline int worker_size() const { return m_worker_size; }
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank; }
inline int shepherd_worker_size() const { return m_shepherd_worker_size; }
inline int shepherd_rank() const { return m_shepherd_rank; }
inline int shepherd_size() const { return m_shepherd_size; }
static int worker_per_shepherd();
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
class QthreadsTeamPolicyMember {
private:
typedef Kokkos::Qthreads execution_space;
typedef execution_space::scratch_memory_space scratch_memory_space;
Impl::QthreadsExec & m_exec;
scratch_memory_space m_team_shared;
const int m_team_size;
const int m_team_rank;
const int m_league_size;
const int m_league_end;
int m_league_rank;
public:
KOKKOS_INLINE_FUNCTION
const scratch_memory_space & team_shmem() const { return m_team_shared; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{}
#else
{ m_exec.shepherd_barrier( m_team_size ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value, int rank ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_broadcast<Type>( value, m_team_size, rank ); }
#endif
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_reduce<Type>( m_team_size, value ); }
#endif
template< typename JoinOp >
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
team_reduce( const typename JoinOp::value_type & value
, const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return typename JoinOp::value_type(); }
#else
{ return m_exec.template shepherd_reduce<JoinOp>( m_team_size, value, op ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
*
* The highest rank thread can compute the reduction total as
* reduction_total = dev.team_scan( value ) + value;
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size, value ); }
#endif
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
* with intra-team non-deterministic ordering accumulation.
*
* The global inter-team accumulation value will, at the end of the league's
* parallel execution, be the scan's total. Parallel execution ordering of
* the league's teams is non-deterministic. As such the base value for each
* team's scan operation is similarly non-deterministic.
*/
template< typename Type >
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value, Type * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
{ return Type(); }
#else
{ return m_exec.template shepherd_scan<Type>( m_team_size, value, global_accum ); }
#endif
//----------------------------------------
// Private driver for task-team parallel.
struct TaskTeam {};
QthreadsTeamPolicyMember();
explicit QthreadsTeamPolicyMember( const TaskTeam & );
//----------------------------------------
// Private for the driver: for ( member_type i( exec, team ); i; i.next_team() ) { ... }
// Initialize.
template< class ... Properties >
QthreadsTeamPolicyMember( Impl::QthreadsExec & exec
, const Kokkos::Impl::TeamPolicyInternal< Qthreads, Properties... > & team )
: m_exec( exec )
, m_team_shared( 0, 0 )
, m_team_size( team.m_team_size )
, m_team_rank( exec.shepherd_worker_rank() )
, m_league_size( team.m_league_size )
, m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
, m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
{
m_exec.shared_reset( m_team_shared );
}
// Continue.
operator bool () const { return m_league_rank < m_league_end; }
// Iterate.
void next_team() { ++m_league_rank; m_exec.shared_reset( m_team_shared ); }
};
template< class ... Properties >
class TeamPolicyInternal< Kokkos::Qthreads, Properties ... >
: public PolicyTraits< Properties... >
{
private:
const int m_league_size;
const int m_team_size;
const int m_shepherd_iter;
public:
//! Tag this class as a kokkos execution policy.
typedef TeamPolicyInternal execution_policy;
typedef Qthreads execution_space;
typedef PolicyTraits< Properties ... > traits;
//----------------------------------------
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return Qthreads::instance().shepherd_worker_size(); }
template< class FunctorType >
static int team_size_recommended( const FunctorType & f )
{ return team_size_max( f ); }
template< class FunctorType >
inline static
int team_size_recommended( const FunctorType & f, const int& )
{ return team_size_max( f ); }
//----------------------------------------
inline int team_size() const { return m_team_size; }
inline int league_size() const { return m_league_size; }
// One active team per shepherd.
TeamPolicyInternal( Kokkos::Qthreads & q
, const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < q.shepherd_worker_size()
? team_size : q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{}
// TODO: Make sure this is correct.
// One active team per shepherd.
TeamPolicyInternal( Kokkos::Qthreads & q
, const int league_size
, const Kokkos::AUTO_t & /* team_size_request */
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( q.shepherd_worker_size() )
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
{}
// One active team per shepherd.
TeamPolicyInternal( const int league_size
, const int team_size
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( team_size < Qthreads::instance().shepherd_worker_size()
? team_size : Qthreads::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
{}
// TODO: Make sure this is correct.
// One active team per shepherd.
TeamPolicyInternal( const int league_size
, const Kokkos::AUTO_t & /* team_size_request */
, const int /* vector_length */ = 0
)
: m_league_size( league_size )
, m_team_size( Qthreads::instance().shepherd_worker_size() )
, m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
{}
// TODO: Doesn't do anything yet. Fix this.
/** \brief Set chunk_size to a discrete value. */
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
TeamPolicyInternal p = *this;
// p.m_chunk_size = chunk_size_;
return p;
}
typedef Impl::QthreadsTeamPolicyMember member_type;
friend class Impl::QthreadsTeamPolicyMember;
};
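// Typical construction (illustrative sketch, not part of the original source):
//
//   Kokkos::TeamPolicy< Kokkos::Qthreads > policy( league_size, Kokkos::AUTO );
//
// With Kokkos::AUTO the constructors above pick one full shepherd's worth of
// workers per team; an explicit team_size is clamped to shepherd_worker_size().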
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
#endif // #define KOKKOS_QTHREADSEXEC_HPP

View File

@ -41,8 +41,8 @@
//@HEADER
*/
#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
#define KOKKOS_QTHREAD_PARALLEL_HPP
#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
#define KOKKOS_QTHREADS_PARALLEL_HPP
#include <vector>
@ -51,7 +51,7 @@
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <Qthread/Kokkos_QthreadExec.hpp>
#include <Qthreads/Kokkos_QthreadsExec.hpp>
//----------------------------------------------------------------------------
@ -63,7 +63,7 @@ namespace Impl {
template< class FunctorType , class ... Traits >
class ParallelFor< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
@ -99,7 +99,7 @@ private:
}
// Function is called once by every concurrent thread.
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
@ -116,7 +116,7 @@ public:
inline
void execute() const
{
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
}
@ -134,7 +134,7 @@ template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
@ -186,7 +186,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
@ -205,10 +205,10 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
QthreadsExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
@ -246,11 +246,11 @@ public:
template< class FunctorType , class ... Properties >
class ParallelFor< FunctorType
, TeamPolicy< Properties ... >
, Kokkos::Qthread >
, Kokkos::Qthreads >
{
private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ;
@ -282,7 +282,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelFor & self = * ((const ParallelFor *) arg );
@ -297,10 +297,10 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch
QthreadsExec::resize_worker_scratch
( /* reduction memory */ 0
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
}
ParallelFor( const FunctorType & arg_functor ,
@ -316,12 +316,12 @@ template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType
, TeamPolicy< Properties... >
, ReducerType
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
@ -365,7 +365,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelReduce & self = * ((const ParallelReduce *) arg );
@ -383,13 +383,13 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch
QthreadsExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
@ -429,7 +429,7 @@ public:
template< class FunctorType , class ... Traits >
class ParallelScan< FunctorType
, Kokkos::RangePolicy< Traits ... >
, Kokkos::Qthread
, Kokkos::Qthreads
>
{
private:
@ -474,7 +474,7 @@ private:
}
}
static void exec( QthreadExec & exec , const void * arg )
static void exec( QthreadsExec & exec , const void * arg )
{
const ParallelScan & self = * ((const ParallelScan *) arg );
@ -497,8 +497,8 @@ public:
inline
void execute() const
{
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
QthreadsExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelScan::exec , this );
}
ParallelScan( const FunctorType & arg_functor
@ -521,37 +521,37 @@ namespace Kokkos {
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType& count )
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >
TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType& count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, count );
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, count );
}
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::QthreadTeamPolicyMember >
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
Impl::QthreadsTeamPolicyMember >
TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, iType(begin), iType(end) );
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, iType(begin), iType(end) );
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >
ThreadVectorRange(const Impl::QthreadsTeamPolicyMember& thread, const iType& count) {
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >(thread,count);
}
KOKKOS_INLINE_FUNCTION
Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember> PerTeam(const Impl::QthreadsTeamPolicyMember& thread) {
return Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
}
KOKKOS_INLINE_FUNCTION
Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember> PerThread(const Impl::QthreadsTeamPolicyMember& thread) {
return Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
@ -560,7 +560,7 @@ Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::Qt
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
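// Usage sketch (illustrative, not part of the original source): inside a
// team-policy functor where 'member' is a QthreadsTeamPolicyMember,
//
//   parallel_for( TeamThreadRange( member, N ), [&]( const int i ) {
//     // work item i, distributed across the team's threads
//   });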
@ -571,7 +571,7 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qthrea
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, ValueType& result) {
result = ValueType();
@ -595,7 +595,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
@ -615,7 +615,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const Lambda& lambda) {
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
@ -630,7 +630,7 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qthr
* val is performed and put into result. This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, ValueType& result) {
result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
@ -652,7 +652,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
* '1 for *'). This functionality requires C++11 support.*/
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
ValueType result = init_result;
@ -679,7 +679,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
* This functionality requires C++11 support.*/
template< typename iType, class FunctorType >
KOKKOS_INLINE_FUNCTION
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
loop_boundaries, const FunctorType & lambda) {
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
@ -697,25 +697,25 @@ void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qth
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
lambda();
}
template<class FunctorType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
if(single_struct.team_member.team_rank()==0) lambda();
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
lambda(val);
}
template<class FunctorType, class ValueType>
KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
if(single_struct.team_member.team_rank()==0) {
lambda(val);
}
@ -724,4 +724,4 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& singl
} // namespace Kokkos
#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */

View File

@ -0,0 +1,320 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Qthreads > ;
//----------------------------------------------------------------------------
TaskExec< Kokkos::Qthreads >::TaskExec()
: m_self_exec( 0 ),
m_team_exec( 0 ),
m_sync_mask( 0 ),
m_sync_value( 0 ),
m_sync_step( 0 ),
m_group_rank( 0 ),
m_team_rank( 0 ),
m_team_size( 1 )
{}
TaskExec< Kokkos::Qthreads >::
TaskExec( Kokkos::Impl::QthreadsExec & arg_exec, int const arg_team_size )
: m_self_exec( & arg_exec ),
m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ),
m_sync_mask( 0 ),
m_sync_value( 0 ),
m_sync_step( 0 ),
m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ),
m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ),
m_team_size( arg_team_size )
{
// This team spans
// m_self_exec->pool_rev( team_size * group_rank )
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
}
Kokkos::memory_fence();
}
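/* Illustrative layout (not part of the original source): for m_team_size == 4
 * the loop above produces m_sync_value == 0x01010101 and
 * m_sync_mask == 0x03030303, i.e. one byte of the 64-bit sync word per
 * team member (hence the team_size <= 8 requirement in execute()).
 */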
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::Qthreads >::team_barrier() const
{
if ( 1 < m_team_size ) {
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<Qthreads> scratch_reduce memory too small");
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout,
"barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n",
m_group_rank,
m_team_rank,
m_sync_step,
m_sync_value,
*sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout,
"barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n",
m_group_rank,
m_team_rank,
m_sync_step,
m_sync_value,
*sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
}
}
}
#endif
//----------------------------------------------------------------------------
void TaskQueueSpecialization< Kokkos::Qthreads >::execute
( TaskQueue< Kokkos::Qthreads > * const queue )
{
using execution_space = Kokkos::Qthreads ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space, void, void > ;
using PoolExec = Kokkos::Impl::QthreadsExec ;
using Member = TaskExec< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<Qthreads> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Member single_exec ;
Member team_exec( self, team_size );
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
// Barrier across entire Qthreads thread pool to ensure initialization.
#pragma omp barrier
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == team_exec.team_rank() ) {
task_root_type * tmp =
0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == tmp ; ++i ) {
for ( int j = 0 ; j < 2 && end == tmp ; ++j ) {
tmp = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
*task_shared = tmp ;
// Fence to be sure shared_task_array is stored
Kokkos::memory_fence();
}
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
Kokkos::memory_fence();
task_root_type * const task = *task_shared ;
#if 0
fprintf( stdout,
"\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n",
team_exec.m_group_rank,
team_exec.m_team_rank,
uintptr_t(task_shared),
uintptr_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
if ( end == task ) {
team_exec.team_barrier();
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
(*task->m_apply)( task, & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
if ( 0 == team_exec.team_rank() ) {
(*task->m_apply)( task, & single_exec );
queue->complete( task );
}
// All team members wait for whole team to reach this statement.
// Not necessary to complete the task.
// Is necessary to prevent task_shared from being updated
// before it is read by all threads.
team_exec.team_barrier();
}
} while(1);
}
// END #pragma omp parallel
}
void TaskQueueSpecialization< Kokkos::Qthreads >::
iff_single_thread_recursive_execute
( TaskQueue< Kokkos::Qthreads > * const queue )
{
using execution_space = Kokkos::Qthreads ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space, void, void > ;
using Member = TaskExec< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
task_root_type * task = end ;
do {
task = end ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
if ( end == task ) break ;
(*task->m_apply)( task, & single_exec );
queue->complete( task );
} while(1);
}
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,156 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
#define KOKKOS_IMPL_QTHREADS_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueueSpecialization< Kokkos::Qthreads >
{
public:
using execution_space = Kokkos::Qthreads ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< execution_space,
typename FunctorType::value_type,
FunctorType
> ;
*ptr = TaskType::apply ;
}
};
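// A minimal usage sketch of proc_set_apply (illustrative only; 'MyFunctor'
// is a hypothetical functor type, not part of this commit):
//
//   using QSpec = TaskQueueSpecialization< Kokkos::Qthreads > ;
//   QSpec::task_base_type::function_type fn = 0 ;
//   QSpec::proc_set_apply< MyFunctor >( & fn );
//   // fn now points at TaskBase< Qthreads, MyFunctor::value_type, MyFunctor >::apply.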
extern template class TaskQueue< Kokkos::Qthreads > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::Qthreads >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::QthreadsExec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Qthreads > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Qthreads > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec, int arg_team_size );
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
* before any team member returns from
* this function call.
*/
void team_barrier() const ;
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP */

View File

@ -41,11 +41,11 @@
//@HEADER
*/
// Experimental unified task-data parallel manycore LDRD
// Experimental unified task-data parallel manycore LDRD.
#include <Kokkos_Core_fwd.hpp>
#if defined( KOKKOS_ENABLE_QTHREAD )
#if defined( KOKKOS_ENABLE_QTHREADS )
#include <stdio.h>
@ -56,17 +56,15 @@
#include <string>
#include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
#include <Qthreads/Kokkos_Qthreads_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_TASKDAG )
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Experimental {
namespace Impl {
typedef TaskMember< Kokkos::Qthread , void , void > Task ;
typedef TaskMember< Kokkos::Qthreads , void , void > Task ;
namespace {
@ -173,16 +171,16 @@ Task::TaskMember( const function_dealloc_type arg_dealloc
void Task::throw_error_add_dependence() const
{
std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
std::cerr << "TaskMember< Qthreads >::add_dependence ERROR"
<< " state(" << m_state << ")"
<< " dep_size(" << m_dep_size << ")"
<< std::endl ;
throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
throw std::runtime_error("TaskMember< Qthreads >::add_dependence ERROR");
}
void Task::throw_error_verify_type()
{
throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
throw std::runtime_error("TaskMember< Qthreads >::verify_type ERROR");
}
//----------------------------------------------------------------------------
@ -190,7 +188,7 @@ void Task::throw_error_verify_type()
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
{
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthreads>::assign ERROR" ;
static const char msg_error_count[] = ": negative reference count" ;
static const char msg_error_complete[] = ": destroy task that is not complete" ;
static const char msg_error_dependences[] = ": destroy task that has dependences" ;
@ -294,7 +292,7 @@ fflush(stdout);
assign( & m_dep[i] , 0 );
}
// Set qthread FEB to full so that dependent tasks are allowed to execute.
// Set Qthreads FEB to full so that dependent tasks are allowed to execute.
// This 'task' may be deleted immediately following this function call.
qthread_fill( & m_qfeb );
@ -319,10 +317,10 @@ aligned_t Task::qthread_func( void * arg )
);
if ( task->m_apply_team && ! task->m_apply_single ) {
Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
Kokkos::Impl::QthreadsTeamPolicyMember::TaskTeam task_team_tag ;
// Initialize team size and rank with shepherd info
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
Kokkos::Impl::QthreadsTeamPolicyMember member( task_team_tag );
(*task->m_apply_team)( task , member );
@ -344,7 +342,7 @@ fflush(stdout);
}
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
// Team hard-wired to one, no cloning
Kokkos::Impl::QthreadTeamPolicyMember member ;
Kokkos::Impl::QthreadsTeamPolicyMember member ;
(*task->m_apply_team)( task , member );
task->closeout();
}
@ -384,8 +382,8 @@ void Task::schedule()
// Increment active task count before spawning.
Kokkos::atomic_increment( m_active_count );
// spawn in qthread. must malloc the precondition array and give to qthread.
// qthread will eventually free this allocation so memory will not be leaked.
// spawn in Qthreads. must malloc the precondition array and give to Qthreads.
// Qthreads will eventually free this allocation so memory will not be leaked.
// concern with thread safety of malloc, does this need to be guarded?
aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
@ -393,7 +391,7 @@ void Task::schedule()
qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
for ( int i = 0 ; i < m_dep_size ; ++i ) {
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthreads precondition flag
}
if ( m_apply_team && ! m_apply_single ) {
@ -446,7 +444,7 @@ fflush(stdout);
namespace Kokkos {
namespace Experimental {
TaskPolicy< Kokkos::Qthread >::
TaskPolicy< Kokkos::Qthreads >::
TaskPolicy
( const unsigned /* arg_task_max_count */
, const unsigned /* arg_task_max_size */
@ -462,7 +460,7 @@ TaskPolicy
if ( m_team_size != 1 && m_team_size != num_worker_per_shepherd ) {
std::ostringstream msg ;
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthread >( "
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads >( "
<< "default_depedence = " << arg_task_default_dependence_capacity
<< " , team_size = " << arg_task_team_size
<< " ) ERROR, valid team_size arguments are { (omitted) , 1 , " << num_worker_per_shepherd << " }" ;
@ -470,14 +468,14 @@ TaskPolicy
}
}
TaskPolicy< Kokkos::Qthread >::member_type &
TaskPolicy< Kokkos::Qthread >::member_single()
TaskPolicy< Kokkos::Qthreads >::member_type &
TaskPolicy< Kokkos::Qthreads >::member_single()
{
static member_type s ;
return s ;
}
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
{
volatile int * const active_task_count = & policy.m_active_count ;
while ( *active_task_count ) qthread_yield();
@ -486,6 +484,5 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
} // namespace Experimental
} // namespace Kokkos
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )

View File

@ -43,15 +43,15 @@
// Experimental unified task-data parallel manycore LDRD
#ifndef KOKKOS_QTHREAD_TASKSCHEDULER_HPP
#define KOKKOS_QTHREAD_TASKSCHEDULER_HPP
#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
#include <string>
#include <typeinfo>
#include <stdexcept>
//----------------------------------------------------------------------------
// Defines to enable experimental Qthread functionality
// Defines to enable experimental Qthreads functionality
#define QTHREAD_LOCAL_PRIORITY
#define CLONED_TASKS
@ -63,7 +63,7 @@
//----------------------------------------------------------------------------
#include <Kokkos_Qthread.hpp>
#include <Kokkos_Qthreads.hpp>
#include <Kokkos_TaskScheduler.hpp>
#include <Kokkos_View.hpp>
@ -78,13 +78,13 @@ namespace Experimental {
namespace Impl {
template<>
class TaskMember< Kokkos::Qthread , void , void >
class TaskMember< Kokkos::Qthreads , void , void >
{
public:
typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadsTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
private:
@ -94,7 +94,7 @@ private:
const function_single_type m_apply_single ; ///< Apply function
const function_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit
aligned_t m_qfeb ; ///< Qthreads full/empty bit
TaskMember ** const m_dep ; ///< Dependences
const int m_dep_capacity ; ///< Capacity of dependences
int m_dep_size ; ///< Actual count of dependences
@ -129,7 +129,7 @@ protected :
~TaskMember();
// Used by TaskMember< Qthread , ResultType , void >
// Used by TaskMember< Qthreads , ResultType , void >
TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
@ -139,7 +139,7 @@ protected :
, const unsigned arg_dependence_capacity
);
// Used for TaskMember< Qthread , void , void >
// Used for TaskMember< Qthreads , void , void >
TaskMember( const function_dealloc_type arg_dealloc
, const function_single_type arg_apply_single
, const function_team_type arg_apply_team
@ -175,15 +175,15 @@ public:
/* Inheritance Requirements on task types:
* typedef FunctorType::value_type value_type ;
* class DerivedTaskType
* : public TaskMember< Qthread , value_type , FunctorType >
* : public TaskMember< Qthreads , value_type , FunctorType >
* { ... };
* class TaskMember< Qthread , value_type , FunctorType >
* : public TaskMember< Qthread , value_type , void >
* class TaskMember< Qthreads , value_type , FunctorType >
* : public TaskMember< Qthreads , value_type , void >
* , public Functor
* { ... };
* If value_type != void
* class TaskMember< Qthread , value_type , void >
* : public TaskMember< Qthread , void , void >
* class TaskMember< Qthreads , value_type , void >
* : public TaskMember< Qthreads , void , void >
*
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
*
@ -300,10 +300,10 @@ public:
KOKKOS_INLINE_FUNCTION static
void apply_single( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthreads , ResultType , void >
// , public FunctorType
// { ... };
@ -316,10 +316,10 @@ public:
KOKKOS_INLINE_FUNCTION static
void apply_single( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthread , ResultType , void >
// TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
// : public TaskMember< Kokkos::Qthreads , ResultType , void >
// , public FunctorType
// { ... };
@ -333,9 +333,9 @@ public:
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
, Kokkos::Impl::QthreadsTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
@ -345,9 +345,9 @@ public:
template< class FunctorType , class ResultType >
KOKKOS_INLINE_FUNCTION static
void apply_team( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t
, Kokkos::Impl::QthreadTeamPolicyMember & member )
, Kokkos::Impl::QthreadsTeamPolicyMember & member )
{
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
derived_type & m = * static_cast< derived_type * >( t );
@ -356,7 +356,7 @@ public:
};
//----------------------------------------------------------------------------
/** \brief Base class for tasks with a result value in the Qthread execution space.
/** \brief Base class for tasks with a result value in the Qthreads execution space.
*
* The FunctorType must be void because this class is accessed by the
* Future class for the task and result value.
@ -365,8 +365,8 @@ public:
* can correctly static_cast from the 'root class' to this class.
*/
template < class ResultType >
class TaskMember< Kokkos::Qthread , ResultType , void >
: public TaskMember< Kokkos::Qthread , void , void >
class TaskMember< Kokkos::Qthreads , ResultType , void >
: public TaskMember< Kokkos::Qthreads , void , void >
{
public:
@ -379,7 +379,7 @@ public:
protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthreads , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
@ -404,16 +404,16 @@ protected:
};
template< class ResultType , class FunctorType >
class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
: public TaskMember< Kokkos::Qthread , ResultType , void >
class TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
: public TaskMember< Kokkos::Qthreads , ResultType , void >
, public FunctorType
{
public:
typedef FunctorType functor_type ;
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef TaskMember< Kokkos::Qthreads , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthreads , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_team_type function_team_type ;
@ -447,16 +447,16 @@ public:
namespace Kokkos {
namespace Experimental {
void wait( TaskPolicy< Kokkos::Qthread > & );
void wait( TaskPolicy< Kokkos::Qthreads > & );
template<>
class TaskPolicy< Kokkos::Qthread >
class TaskPolicy< Kokkos::Qthreads >
{
public:
typedef Kokkos::Qthread execution_space ;
typedef Kokkos::Qthreads execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
typedef Kokkos::Impl::QthreadsTeamPolicyMember member_type ;
private:
@ -650,7 +650,7 @@ public:
static member_type & member_single();
friend void wait( TaskPolicy< Kokkos::Qthread > & );
friend void wait( TaskPolicy< Kokkos::Qthreads > & );
};
} /* namespace Experimental */
@ -660,5 +660,5 @@ public:
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
#endif /* #ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP */

View File

@ -0,0 +1,319 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Manage task allocation, deallocation, and scheduling.
*
* Task execution is handled here directly for the Qthreads implementation.
*/
template<>
class TaskQueue< Kokkos::Qthreads > {
private:
using execution_space = Kokkos::Qthreads ;
using memory_space = Kokkos::HostSpace ;
using device_type = Kokkos::Device< execution_space, memory_space > ;
using memory_pool = Kokkos::Experimental::MemoryPool< device_type > ;
using task_root_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
using specialization = Kokkos::Impl::TaskQueueSpecialization< execution_space > ;
friend class Kokkos::TaskScheduler< execution_space > ;
struct Destroy {
TaskQueue * m_queue ;
void destroy_shared_allocation();
};
//----------------------------------------
enum : int { TASK_STATE_NULL = 0, ///< Does not exist
TASK_STATE_CONSTRUCTING = 1, ///< Is under construction
TASK_STATE_WAITING = 2, ///< Is waiting for execution
TASK_STATE_EXECUTING = 4, ///< Is executing
TASK_STATE_RESPAWN = 8, ///< Requested respawn
TASK_STATE_COMPLETE = 16 ///< Execution is complete
};
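// Sketch of the task life cycle implied by schedule(), reschedule(), and
// complete() below (inferred from their pre/postcondition comments, not a
// normative diagram):
//
//   TASK_STATE_CONSTRUCTING -> TASK_STATE_WAITING -> TASK_STATE_EXECUTING
//     -> TASK_STATE_COMPLETE, or, if respawn was requested while executing,
//     -> TASK_STATE_RESPAWN -> TASK_STATE_WAITING (via schedule()).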
// Queue is organized as [ priority ][ type ]
memory_pool m_memory ;
unsigned m_team_size ; // Number of threads in a team
long m_accum_alloc ; // Accumulated number of allocations
int m_count_alloc ; // Current number of allocations
int m_max_alloc ; // Maximum number of allocations
int m_ready_count ; // Number of ready or executing
//----------------------------------------
~TaskQueue();
TaskQueue() = delete ;
TaskQueue( TaskQueue && ) = delete ;
TaskQueue( TaskQueue const & ) = delete ;
TaskQueue & operator = ( TaskQueue && ) = delete ;
TaskQueue & operator = ( TaskQueue const & ) = delete ;
TaskQueue
( const memory_space & arg_space,
unsigned const arg_memory_pool_capacity,
unsigned const arg_memory_pool_superblock_capacity_log2
);
// Schedule a task
// Precondition:
// task is not executing
// task->m_next is the dependence or zero
// Postcondition:
// task->m_next is linked list membership
KOKKOS_FUNCTION
void schedule( task_root_type * const );
// Reschedule a task
// Precondition:
// task is in Executing state
// task->m_next == LockTag
// Postcondition:
// task is in Executing-Respawn state
// task->m_next == 0 (no dependence)
KOKKOS_FUNCTION
void reschedule( task_root_type * );
// Complete a task
// Precondition:
// task is not executing
// task->m_next == LockTag => task is complete
// task->m_next != LockTag => task is respawn
// Postcondition:
// task->m_wait == LockTag => task is complete
// task->m_wait != LockTag => task is waiting
KOKKOS_FUNCTION
void complete( task_root_type * );
public:
// If and only if the execution space is a single thread
// then execute ready tasks.
KOKKOS_INLINE_FUNCTION
void iff_single_thread_recursive_execute()
{
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
specialization::iff_single_thread_recursive_execute( this );
#endif
}
void execute() { specialization::execute( this ); }
template< typename FunctorType >
void proc_set_apply( typename task_root_type::function_type * ptr )
{
specialization::template proc_set_apply< FunctorType >( ptr );
}
// Assign task pointer with reference counting of assigned tasks
template< typename LV, typename RV >
KOKKOS_FUNCTION static
void assign( TaskBase< execution_space, LV, void > ** const lhs,
TaskBase< execution_space, RV, void > * const rhs )
{
using task_lhs = TaskBase< execution_space, LV, void > ;
#if 0
{
printf( "assign( 0x%lx { 0x%lx %d %d }, 0x%lx { 0x%lx %d %d } )\n",
uintptr_t( lhs ? *lhs : 0 ),
uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ),
int( lhs && *lhs ? (*lhs)->m_task_type : 0 ),
int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ),
uintptr_t(rhs),
uintptr_t( rhs ? rhs->m_next : 0 ),
int( rhs ? rhs->m_task_type : 0 ),
int( rhs ? rhs->m_ref_count : 0 )
);
fflush( stdout );
}
#endif
if ( *lhs )
{
const int count = Kokkos::atomic_fetch_add( &((*lhs)->m_ref_count), -1 );
if ( ( 1 == count ) && ( (*lhs)->m_state == TASK_STATE_COMPLETE ) ) {
// Reference count is zero and task is complete, deallocate.
(*lhs)->m_queue->deallocate( *lhs, (*lhs)->m_alloc_size );
}
else if ( count <= 1 ) {
Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
}
// GEM: Should I check that there are no dependences here? Can the state
// be set to complete while there are still dependences?
}
if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count), 1 ); }
// Force write of *lhs
*static_cast< task_lhs * volatile * >(lhs) = rhs ;
Kokkos::memory_fence();
}
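// Usage sketch for assign() (hypothetical pointers, for illustration):
//
//   task_root_type * a = /* obtained from a queue allocation */ ;
//   task_root_type * b = 0 ;
//   assign( & b, a );  // b shares ownership; a->m_ref_count incremented
//   assign( & b, 0 );  // reference dropped; if the count reaches zero and
//                      // a->m_state == TASK_STATE_COMPLETE, a is deallocated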
KOKKOS_FUNCTION
size_t allocate_block_size( size_t n ); ///< Actual block size allocated
KOKKOS_FUNCTION
void * allocate( size_t n ); ///< Allocate from the memory pool
KOKKOS_FUNCTION
void deallocate( void * p, size_t n ); ///< Deallocate to the memory pool
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskBase< Kokkos::Qthreads, void, void >
{
public:
enum : int16_t { TaskTeam = TaskBase< void, void, void >::TaskTeam,
TaskSingle = TaskBase< void, void, void >::TaskSingle,
Aggregate = TaskBase< void, void, void >::Aggregate };
enum : uintptr_t { LockTag = TaskBase< void, void, void >::LockTag,
EndTag = TaskBase< void, void, void >::EndTag };
using execution_space = Kokkos::Qthreads ;
using queue_type = TaskQueue< execution_space > ;
template< typename > friend class Kokkos::TaskScheduler ;
typedef void (* function_type) ( TaskBase *, void * );
// sizeof(TaskBase) == 48 for the common fields below; m_qfeb and m_state add Qthreads-specific state.
function_type m_apply ; ///< Apply function pointer
queue_type * m_queue ; ///< Queue in which this task resides
TaskBase * m_dep ; ///< Dependence
int32_t m_ref_count ; ///< Reference count
int32_t m_alloc_size ; ///< Allocation size
int32_t m_dep_count ; ///< Aggregate's number of dependences
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
aligned_t m_qfeb ; ///< Qthread full/empty bit
int m_state ; ///< State of the task
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
TaskBase & operator = ( const TaskBase & ) = delete ;
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
KOKKOS_INLINE_FUNCTION
TaskBase() noexcept
: m_apply(0),
m_queue(0),
m_dep(0),
m_ref_count(0),
m_alloc_size(0),
m_dep_count(0),
m_task_type( TaskSingle ),
m_priority( 1 /* TaskRegularPriority */ ),
m_qfeb(0),
m_state( queue_type::TASK_STATE_CONSTRUCTING )
{
qthread_empty( & m_qfeb ); // Set to full when complete
}
//----------------------------------------
static aligned_t qthread_func( void * arg );
KOKKOS_INLINE_FUNCTION
TaskBase ** aggregate_dependences()
{ return reinterpret_cast<TaskBase**>( this + 1 ); }
KOKKOS_INLINE_FUNCTION
bool requested_respawn()
{ return m_state == queue_type::TASK_STATE_RESPAWN; }
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Assign dependence to m_dep. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange( & m_dep, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
if ( 0 != dep ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_fetch_add( &(dep->m_ref_count), 1 );
}
}
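// Intended calling sequence (sketch; 'task', 'dep', and 'queue' are
// hypothetical pointers):
//
//   task->add_dependence( dep ); // store dep in m_dep, bump dep's ref count
//   queue->schedule( task );     // schedule() consumes m_dep as precondition
//
// A second add_dependence() without an intervening schedule() aborts,
// because the atomic_exchange above would observe a non-zero m_dep.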
using get_return_type = void ;
KOKKOS_INLINE_FUNCTION
get_return_type get() const {}
};
} /* namespace Impl */
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,436 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#if defined( KOKKOS_ENABLE_TASKPOLICY )
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
template< typename ExecSpace >
void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
{
m_queue->~TaskQueue();
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
TaskQueue< ExecSpace >::TaskQueue
( const typename TaskQueue< ExecSpace >::memory_space & arg_space,
unsigned const arg_memory_pool_capacity,
unsigned const arg_memory_pool_superblock_capacity_log2 )
: m_memory( arg_space,
arg_memory_pool_capacity,
arg_memory_pool_superblock_capacity_log2 ),
m_team_size( unsigned( qthread_num_workers_local(NO_SHEPHERD) ) ),
m_accum_alloc(0),
m_count_alloc(0),
m_max_alloc(0),
m_ready_count(0)
{}
//----------------------------------------------------------------------------
template< typename ExecSpace >
TaskQueue< ExecSpace >::~TaskQueue()
{
// Verify that ready count is zero.
if ( 0 != m_ready_count ) {
Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
}
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
{
return m_memory.allocate_block_size( n );
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void * TaskQueue< ExecSpace >::allocate( size_t n )
{
void * const p = m_memory.allocate(n);
if ( p ) {
Kokkos::atomic_increment( & m_accum_alloc );
Kokkos::atomic_increment( & m_count_alloc );
if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
}
return p ;
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::deallocate( void * p, size_t n )
{
m_memory.deallocate( p, n );
Kokkos::atomic_decrement( & m_count_alloc );
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule
( typename TaskQueue< ExecSpace >::task_root_type * const task )
{
#if 0
printf( "schedule( 0x%lx { %d %d %d }\n",
uintptr_t(task),
task->m_task_type,
task->m_priority,
task->m_ref_count );
#endif
// The task has been constructed and is waiting to be executed.
task->m_state = TASK_STATE_WAITING ;
if ( task->m_task_type != task_root_type::Aggregate ) {
// Scheduling a single or team task.
// Increment active task count before spawning.
Kokkos::atomic_increment( & m_ready_count );
if ( task->m_dep == 0 ) {
// Schedule a task with no dependences.
if ( task_root_type::TaskTeam == task->m_task_type && m_team_size > 1 ) {
// If there is more than one shepherd, spawn on a shepherd other than this one.
const int num_shepherd = qthread_num_shepherds();
const int this_shepherd = qthread_shep();
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
#if 0
fprintf( stdout,
"worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
qthread_shep(),
qthread_worker_local(NULL),
reinterpret_cast<unsigned long>(this),
spawn_shepherd,
m_team_size - 1
);
fflush(stdout);
#endif
qthread_spawn_cloneable(
& task_root_type::qthread_func,
task,
0,
NULL,
0, // no dependences
0, // dependences array
spawn_shepherd,
unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
m_team_size - 1
);
}
else {
qthread_spawn(
& task_root_type::qthread_func,
task,
0,
NULL,
0, // no dependences
0, // dependences array
NO_SHEPHERD,
QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
);
}
}
else if ( task->m_dep->m_task_type != task_root_type::Aggregate ) {
// Schedule a task with a single, non-aggregate dependence.
// Malloc the precondition array to pass to qthread_spawn(): the
// dependence count followed by the single precondition flag. Qthreads
// will eventually free this allocation so memory will not be leaked.
// Is malloc thread-safe? Should this call be guarded? The memory can't
// be allocated from the pool allocator because Qthreads frees it using
// free().
aligned_t ** qprecon = (aligned_t **) malloc( 2 * sizeof(aligned_t *) );
qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(1) );
qprecon[1] = & task->m_dep->m_qfeb ; // Qthreads precondition flag
if ( task->m_task_type == task_root_type::TaskTeam && m_team_size > 1) {
// If there is more than one shepherd, spawn on a shepherd other than this one.
const int num_shepherd = qthread_num_shepherds();
const int this_shepherd = qthread_shep();
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
#if 0
fprintf( stdout,
"worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
qthread_shep(),
qthread_worker_local(NULL),
reinterpret_cast<unsigned long>(this),
spawn_shepherd,
m_team_size - 1
);
fflush(stdout);
#endif
qthread_spawn_cloneable(
& task_root_type::qthread_func,
task,
0,
NULL,
1, /* dependence count */
qprecon, /* dependences */
spawn_shepherd,
unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
m_team_size - 1
);
}
else {
qthread_spawn(
& task_root_type::qthread_func, /* function */
task, /* function argument */
0,
NULL,
1, /* dependence count */
qprecon, /* dependences */
NO_SHEPHERD,
QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
);
}
}
}
else {
// GEM: How do I handle an aggregate (when_all) task?
}
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::reschedule( task_root_type * task )
{
// Precondition:
// task is in Executing state
// task->m_next == LockTag
//
// Postcondition:
// task is in Executing-Respawn state
// task->m_next == 0 (no dependence)
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
}
}
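// Respawn sketch: while executing, a task's apply function may request a
// respawn; complete() below then observes m_next != LockTag and calls
// schedule() again instead of transitioning the task to complete.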
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::complete
( typename TaskQueue< ExecSpace >::task_root_type * task )
{
// Complete a runnable task that has finished executing
// or a when_all task when all of its dependences are complete.
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
#if 0
printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n",
uintptr_t(task),
uintptr_t(task->m_wait),
uintptr_t(task->m_next),
task->m_task_type,
task->m_priority,
task->m_ref_count
);
fflush( stdout );
#endif
const bool runnable = task_root_type::Aggregate != task->m_task_type ;
//----------------------------------------
if ( runnable && lock != task->m_next ) {
// A runnable task has finished executing and requested respawn.
// Schedule the task for subsequent execution.
schedule( task );
}
//----------------------------------------
else {
// Either an aggregate or a runnable task that executed
// and did not respawn. Transition this task to complete.
// If 'task' is an aggregate then any of the runnable tasks that
// it depends upon may be attempting to complete this 'task'.
// A task must be transitioned to complete status exactly once.
// This is controlled by atomically locking the wait queue.
// Stop other tasks from adding themselves to this task's wait queue
// by locking the head of this task's wait queue.
task_root_type * x = Kokkos::atomic_exchange( & task->m_wait, lock );
if ( x != (task_root_type *) lock ) {
// This thread has transitioned this 'task' to complete.
// 'task' is no longer in a queue and is not executing
// so decrement the reference count from 'task's creation.
// If no other references to this 'task' then it will be deleted.
TaskQueue::assign( & task, zero );
// This thread has exclusive access to the wait list so
// the concurrency-safe pop_task function is not needed.
// Schedule the tasks that have been waiting on the input 'task',
// which may have been deleted.
while ( x != end ) {
// Set x->m_next = zero <= no dependence
task_root_type * const next =
(task_root_type *) Kokkos::atomic_exchange( & x->m_next, zero );
schedule( x );
x = next ;
}
}
}
if ( runnable ) {
// A runnable task was popped from a ready queue and executed.
// If respawned into a ready queue then the ready count was incremented
// so decrement whether respawned or not.
Kokkos::atomic_decrement( & m_ready_count );
}
}
//----------------------------------------------------------------------------
template<>
aligned_t
TaskBase< Kokkos::Qthreads, void, void >::qthread_func( void * arg )
{
using execution_space = Kokkos::Qthreads ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = Kokkos::Impl::QthreadsTeamPolicyMember;
task_root_type * const task = reinterpret_cast< task_root_type * >( arg );
// The first member of the team changes the state to executing.
// Use compare-exchange to avoid race condition with a respawn.
Kokkos::atomic_compare_exchange_strong( & task->m_state,
queue_type::TASK_STATE_WAITING,
queue_type::TASK_STATE_EXECUTING
);
if ( task_root_type::TaskTeam == task->m_task_type )
{
if ( 1 < task->m_queue->m_team_size ) {
// Team task with team size of more than 1.
Member::TaskTeam task_team_tag ;
// Initialize team size and rank with shepherd info
Member member( task_team_tag );
(*task->m_apply)( task , & member );
#if 0
fprintf( stdout,
"worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n",
qthread_shep(),
qthread_worker_local(NULL),
reinterpret_cast<unsigned long>(task),
member.team_rank(),
member.team_size()
);
fflush(stdout);
#endif
member.team_barrier();
if ( member.team_rank() == 0 ) task->closeout();
member.team_barrier();
}
else {
// Team task with team size of 1.
Member member ;
(*task->m_apply)( task , & member );
task->closeout();
}
}
else {
(*task->m_apply)( task );
task->closeout();
}
#if 0
fprintf( stdout
, "worker(%d.%d) task 0x%.12lx return\n"
, qthread_shep()
, qthread_worker_local(NULL)
, reinterpret_cast<unsigned long>(task)
);
fflush(stdout);
#endif
return 0 ;
}
} /* namespace Impl */
} /* namespace Kokkos */
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -22,4 +22,3 @@ sh autogen.sh
# install
make install

View File

@ -264,7 +264,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
for ( int i = 0 ; i < n ; ++i ) {
Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
exec.m_pool_state = ThreadsExec::Inactive ;
@ -308,7 +308,7 @@ void ThreadsExec::fence()
{
if ( s_thread_pool_size[0] ) {
// Wait for the root thread to complete:
Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
}
s_current_function = 0 ;
@ -724,7 +724,7 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array for used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -777,7 +777,7 @@ void ThreadsExec::finalize()
s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}

View File

@ -187,13 +187,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -229,13 +229,13 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the reduction and broadcast
@ -264,7 +264,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , reduce_memory() , fan.reduce_memory() );
}
@ -280,7 +280,7 @@ public:
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
}
}
@ -312,7 +312,7 @@ public:
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: Active -> ReductionAvailable (or ScanAvailable)
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
Join::join( f , work_value , fan.reduce_memory() );
}
@ -330,8 +330,8 @@ public:
// Wait: Active -> ReductionAvailable
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
}
@ -342,7 +342,7 @@ public:
// Wait for all threads to complete inclusive scan
// Wait: ScanAvailable -> Rendezvous
Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
}
//--------------------------------
@ -350,7 +350,7 @@ public:
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
// Wait: ReductionAvailable -> ScanAvailable
Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
// Set: ScanAvailable -> Rendezvous
fan.m_pool_state = ThreadsExec::Rendezvous ;
}
@ -377,13 +377,13 @@ public:
// Wait for all threads to copy previous thread's inclusive scan value
// Wait for all threads: Rendezvous -> ScanCompleted
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
}
if ( rev_rank ) {
// Set: ScanAvailable -> ScanCompleted
m_pool_state = ThreadsExec::ScanCompleted ;
// Wait: ScanCompleted -> Active
Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
}
// Set: ScanCompleted -> Active
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
@ -410,7 +410,7 @@ public:
// Fan-in reduction with highest ranking thread as the root
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
// Wait: Active -> Rendezvous
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
}
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
@ -418,7 +418,7 @@ public:
if ( rev_rank ) {
m_pool_state = ThreadsExec::Rendezvous ;
// Wait: Rendezvous -> Active
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
}
else {
// Root thread does the thread-scan before releasing threads

View File

@ -49,6 +49,7 @@
#include <utility>
#include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <Kokkos_Atomic.hpp>
@ -103,13 +104,13 @@ public:
// Wait for fan-in threads
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
}
// If not root then wait for release
if ( m_team_rank_rev ) {
m_exec->state() = ThreadsExec::Rendezvous ;
Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
}
return ! m_team_rank_rev ;
@ -350,6 +351,10 @@ public:
const int team_rank_rev = pool_rank_rev % team.team_alloc();
const size_t pool_league_size = m_exec->pool_size() / team.team_alloc() ;
const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
if(pool_league_rank_rev >= pool_league_size) {
m_invalid_thread = 1;
return;
}
const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 );
const int pool_num_teams = m_exec->pool_size()/team.team_alloc();
@ -505,7 +510,8 @@ private:
, const int team_size_request )
{
const int pool_size = traits::execution_space::thread_pool_size(0);
const int team_max = traits::execution_space::thread_pool_size(1);
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
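// The requested team size is capped at both the thread pool size and
// HostThreadTeamData's limit on team members.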
const int team_grain = traits::execution_space::thread_pool_size(2);
m_league_size = league_size_request ;
@ -552,8 +558,12 @@ public:
template< class FunctorType >
inline static
int team_size_max( const FunctorType & )
{ return traits::execution_space::thread_pool_size(1); }
int team_size_max( const FunctorType & ) {
int pool_size = traits::execution_space::thread_pool_size(1);
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
return pool_size<max_host_team_size?pool_size:max_host_team_size;
}
template< class FunctorType >
static int team_size_recommended( const FunctorType & )
@ -819,9 +829,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
result+=tmp;
lambda(i,result);
}
}
@ -835,18 +843,14 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result ) {
ValueType result = init_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
lambda(i,result);
}
init_result = result;
}
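// Usage sketch for the join-based overload above ('team', 'a', and 'n'
// are hypothetical):
//
//   double max_val = 0 ;
//   parallel_reduce( ThreadVectorRange( team, n ),
//                    [&]( const int i, double & val )
//                      { if ( a(i) > val ) val = a(i) ; },
//                    []( double & dst, const double & src )
//                      { if ( src > dst ) dst = src ; },
//                    max_val );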
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)

File diff suppressed because it is too large

View File

@ -56,12 +56,13 @@ int bit_scan_forward( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_forward(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __cnttz4(i);
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#else
unsigned t = 1u;
int r = 0;
while ( i && ( ( i & t ) == 0 ) )
@ -79,10 +80,12 @@ int bit_scan_reverse( unsigned i )
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( KOKKOS_COMPILER_INTEL )
return _bit_scan_reverse(i);
#elif defined( KOKKOS_COMPILER_IBM )
return shift - __cntlz4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
@ -101,10 +104,12 @@ int bit_count( unsigned i )
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#elif defined( KOKKOS_COMPILER_IBM )
return __popcnt4(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ( ( i >> 1 ) & ~0u / 3u ); // temp

View File

@ -147,7 +147,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
}
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
@ -155,7 +155,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
void finalize_internal( const bool all_spaces = false )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
@ -449,5 +449,323 @@ void fence()
Impl::fence_internal();
}
void print_configuration( std::ostream & out , const bool detail )
{
std::ostringstream msg;
msg << "Compiler:" << std::endl;
#ifdef KOKKOS_COMPILER_APPLECC
msg << " KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CLANG
msg << " KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
#endif
#ifdef KOKKOS_COMPILER_CRAYC
msg << " KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_GNU
msg << " KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
#endif
#ifdef KOKKOS_COMPILER_IBM
msg << " KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
#endif
#ifdef KOKKOS_COMPILER_INTEL
msg << " KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
#endif
#ifdef KOKKOS_COMPILER_NVCC
msg << " KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
#endif
#ifdef KOKKOS_COMPILER_PGI
msg << " KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
#endif
msg << "Architecture:" << std::endl;
#ifdef KOKKOS_ENABLE_ISA_KNC
msg << " KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_ISA_X86_64
msg << " KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
#else
msg << " KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
#endif
msg << "Devices:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA: ";
#ifdef KOKKOS_ENABLE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP: ";
#ifdef KOKKOS_ENABLE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PTHREAD: ";
#ifdef KOKKOS_ENABLE_PTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_STDTHREAD: ";
#ifdef KOKKOS_ENABLE_STDTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINTHREAD: ";
#ifdef KOKKOS_ENABLE_WINTHREAD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_SERIAL: ";
#ifdef KOKKOS_ENABLE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Default Device:" << std::endl;
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Atomics:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_ATOMICS: ";
#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_GNU_ATOMICS: ";
#ifdef KOKKOS_ENABLE_GNU_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_ATOMICS: ";
#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_OPENMP_ATOMICS: ";
#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Vectorization:" << std::endl;
msg << " KOKKOS_ENABLE_PRAGMA_IVDEP: ";
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_SIMD: ";
#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_UNROLL: ";
#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PRAGMA_VECTOR: ";
#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Memory:" << std::endl;
msg << " KOKKOS_ENABLE_HBWSPACE: ";
#ifdef KOKKOS_ENABLE_HBWSPACE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_POSIX_MEMALIGN: ";
#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << "Options:" << std::endl;
msg << " KOKKOS_ENABLE_ASM: ";
#ifdef KOKKOS_ENABLE_ASM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX1Z: ";
#ifdef KOKKOS_ENABLE_CXX1Z
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_HWLOC: ";
#ifdef KOKKOS_ENABLE_HWLOC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_LIBRT: ";
#ifdef KOKKOS_ENABLE_LIBRT
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_MPI: ";
#ifdef KOKKOS_ENABLE_MPI
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_PROFILING: ";
#ifdef KOKKOS_ENABLE_PROFILING
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#ifdef KOKKOS_ENABLE_CUDA
msg << "Cuda Options:" << std::endl;
msg << " KOKKOS_ENABLE_CUDA_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUDA_UVM: ";
#ifdef KOKKOS_ENABLE_CUDA_UVM
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CUSPARSE: ";
#ifdef KOKKOS_ENABLE_CUSPARSE
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
msg << "yes" << std::endl;
#else
msg << "no" << std::endl;
#endif
#endif
msg << "\nRuntime Configuration:" << std::endl;
#ifdef KOKKOS_ENABLE_CUDA
Cuda::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_OPENMP
OpenMP::print_configuration(msg, detail);
#endif
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( KOKKOS_ENABLE_WINTHREAD )
Threads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_QTHREADS
Qthreads::print_configuration(msg, detail);
#endif
#ifdef KOKKOS_ENABLE_SERIAL
Serial::print_configuration(msg, detail);
#endif
out << msg.str() << std::endl;
}
} // namespace Kokkos
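
For reference, a minimal sketch (not part of this diff) of how an application triggers the configuration dump above; Kokkos::print_configuration( std::ostream & , bool detail ) is the public entry point whose body ends here.

#include <Kokkos_Core.hpp>
#include <iostream>

int main( int argc , char * argv[] )
{
  Kokkos::initialize( argc , argv );
  // 'true' also requests the "Runtime Configuration" sections printed
  // by each enabled execution space at the end of the routine above.
  Kokkos::print_configuration( std::cout , true );
  Kokkos::finalize();
  return 0 ;
}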

View File

@ -0,0 +1,653 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_FUNCTORANALYSIS_HPP
#define KOKKOS_FUNCTORANALYSIS_HPP
#include <cstddef>
#include <Kokkos_Core_fwd.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Reducer.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
struct FunctorPatternInterface {
struct FOR {};
struct REDUCE {};
struct SCAN {};
};
/** \brief Query Functor and execution policy argument tag for value type.
*
* If 'value_type' is not explicitly declared in the functor
 * then attempt to deduce the type from the FunctorType::operator()
 * interface used by the pattern and policy.
*
* For the REDUCE pattern generate a Reducer and finalization function
* derived from what is available within the functor.
*/
template< typename PatternInterface , class Policy , class Functor >
struct FunctorAnalysis {
private:
using FOR = FunctorPatternInterface::FOR ;
using REDUCE = FunctorPatternInterface::REDUCE ;
using SCAN = FunctorPatternInterface::SCAN ;
//----------------------------------------
struct VOID {};
template< typename P = Policy , typename = std::false_type >
struct has_work_tag
{
using type = void ;
using wtag = VOID ;
};
template< typename P >
struct has_work_tag
< P , typename std::is_same< typename P::work_tag , void >::type >
{
using type = typename P::work_tag ;
using wtag = typename P::work_tag ;
};
using Tag = typename has_work_tag<>::type ;
using WTag = typename has_work_tag<>::wtag ;
//----------------------------------------
// Check for Functor::value_type, which is either a simple type T or T[]
template< typename F , typename = std::false_type >
struct has_value_type { using type = void ; };
template< typename F >
struct has_value_type
< F , typename std::is_same< typename F::value_type , void >::type >
{
using type = typename F::value_type ;
static_assert( ! std::is_reference< type >::value &&
std::rank< type >::value <= 1 &&
std::extent< type >::value == 0
, "Kokkos Functor::value_type is T or T[]" );
};
//----------------------------------------
// If Functor::value_type does not exist then evaluate operator(),
// depending upon the pattern and whether the policy has a work tag,
// to determine the reduction or scan value_type.
template< typename F
, typename P = PatternInterface
, typename V = typename has_value_type<F>::type
, bool T = std::is_same< Tag , void >::value
>
struct deduce_value_type { using type = V ; };
template< typename F >
struct deduce_value_type< F , REDUCE , void , true > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , REDUCE , void , false > {
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & ) const );
template< typename M , typename A >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , true > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
template< typename F >
struct deduce_value_type< F , SCAN , void , false > {
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag , M , A & , I ) const );
template< typename M , typename A , typename I >
KOKKOS_INLINE_FUNCTION static
A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
using type = decltype( deduce( & F::operator() ) );
};
//----------------------------------------
using candidate_type = typename deduce_value_type< Functor >::type ;
enum { candidate_is_void = std::is_same< candidate_type , void >::value
, candidate_is_array = std::rank< candidate_type >::value == 1 };
//----------------------------------------
public:
using value_type = typename std::remove_extent< candidate_type >::type ;
static_assert( ! std::is_const< value_type >::value
, "Kokkos functor operator reduce argument cannot be const" );
private:
// Stub to avoid defining a type 'void &'
using ValueType = typename
std::conditional< candidate_is_void , VOID , value_type >::type ;
public:
using pointer_type = typename
std::conditional< candidate_is_void , void , ValueType * >::type ;
using reference_type = typename
std::conditional< candidate_is_array , ValueType * , typename
std::conditional< ! candidate_is_void , ValueType & , void >
::type >::type ;
private:
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< IsArray , unsigned >::type
get_length( FF const & f ) { return f.value_count ; }
template< bool IsArray , class FF >
KOKKOS_INLINE_FUNCTION static
typename std::enable_if< ! IsArray , unsigned >::type
get_length( FF const & ) { return 1 ; }
public:
enum { StaticValueSize = ! candidate_is_void &&
! candidate_is_array
? sizeof(ValueType) : 0 };
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f); }
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Functor & f )
{ return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
//----------------------------------------
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_count( const Unknown & )
{ return 1 ; }
template< class Unknown >
KOKKOS_FORCEINLINE_FUNCTION static
unsigned value_size( const Unknown & )
{ return sizeof(ValueType); }
private:
enum INTERFACE : int
{ DISABLE = 0
, NO_TAG_NOT_ARRAY = 1
, NO_TAG_IS_ARRAY = 2
, HAS_TAG_NOT_ARRAY = 3
, HAS_TAG_IS_ARRAY = 4
, DEDUCED =
! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
std::is_same<Tag,void>::value
? (candidate_is_array ? NO_TAG_IS_ARRAY : NO_TAG_NOT_ARRAY)
: (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
};
//----------------------------------------
// parallel_reduce join operator
template< class F , INTERFACE >
struct has_join_function ;
template< class F >
struct has_join_function< F , NO_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( *dst , *src ); }
};
template< class F >
struct has_join_function< F , NO_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( dst , src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_NOT_ARRAY >
{
typedef volatile ValueType & vref_type ;
typedef volatile const ValueType & cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , *dst , *src ); }
};
template< class F >
struct has_join_function< F , HAS_TAG_IS_ARRAY >
{
typedef volatile ValueType * vref_type ;
typedef volatile const ValueType * cvref_type ;
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{ f.join( WTag() , dst , src ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceJoin
{
KOKKOS_INLINE_FUNCTION static
void join( F const & f
, ValueType volatile * dst
, ValueType volatile const * src )
{
const int n = FunctorAnalysis::value_count( f );
for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
}
};
template< class F >
struct DeduceJoin< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void join( F const &
, ValueType volatile *
, ValueType volatile const * ) {}
};
template< class F , INTERFACE I >
struct DeduceJoin< F , I ,
decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
: public has_join_function<F,I> {};
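// (Editor's illustration, not original source.) The detection idiom
// above works because 'decltype( has_join_function<F,I>::enable_if(
// & F::join ) )' is a valid type (void) exactly when F declares a
// 'join' matching one of the enable_if overloads; the partial
// specialization is then viable and forwards to the functor's join.
// Otherwise substitution of '& F::join' fails, SFINAE silently
// discards the specialization, and the primary template's default
// element-wise '+=' join is used instead.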
//----------------------------------------
template< class , INTERFACE >
struct has_init_function ;
template< class F >
struct has_init_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( *dst ); }
};
template< class F >
struct has_init_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), *dst ); }
};
template< class F >
struct has_init_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void init( F const & f , ValueType * dst )
{ f.init( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceInit
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
};
template< class F >
struct DeduceInit< F , DISABLE , void >
{
KOKKOS_INLINE_FUNCTION static
void init( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceInit< F , I ,
decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
: public has_init_function<F,I> {};
//----------------------------------------
public:
struct Reducer
{
private:
Functor const & m_functor ;
ValueType * const m_result ;
int const m_length ;
public:
using reducer = Reducer ;
using value_type = FunctorAnalysis::value_type ;
using memory_space = void ;
using reference_type = FunctorAnalysis::reference_type ;
KOKKOS_INLINE_FUNCTION
void join( ValueType volatile * dst
, ValueType volatile const * src ) const noexcept
{ DeduceJoin<>::join( m_functor , dst , src ); }
KOKKOS_INLINE_FUNCTION
void init( ValueType * dst ) const noexcept
{ DeduceInit<>::init( m_functor , dst ); }
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( Functor const & arg_functor
, ValueType * arg_value = 0
, int arg_length = 0 ) noexcept
: m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept { return m_length ; }
KOKKOS_INLINE_FUNCTION
ValueType & operator[]( int i ) const noexcept
{ return m_result[i]; }
private:
template< bool IsArray >
constexpr
typename std::enable_if< IsArray , ValueType * >::type
ref() const noexcept { return m_result ; }
template< bool IsArray >
constexpr
typename std::enable_if< ! IsArray , ValueType & >::type
ref() const noexcept { return *m_result ; }
public:
KOKKOS_INLINE_FUNCTION
auto result() const noexcept
-> decltype( Reducer::template ref< candidate_is_array >() )
{ return Reducer::template ref< candidate_is_array >(); }
};
//----------------------------------------
private:
template< class , INTERFACE >
struct has_final_function ;
// No tag, not array
template< class F >
struct has_final_function< F , NO_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( *dst ); }
};
// No tag, is array
template< class F >
struct has_final_function< F , NO_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( dst ); }
};
// Has tag, not array
template< class F >
struct has_final_function< F , HAS_TAG_NOT_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType & ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), *dst ); }
};
// Has tag, is array
template< class F >
struct has_final_function< F , HAS_TAG_IS_ARRAY >
{
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (F::*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void enable_if( void (*)( WTag const & , ValueType * ) );
KOKKOS_INLINE_FUNCTION static
void final( F const & f , ValueType * dst )
{ f.final( WTag(), dst ); }
};
template< class F = Functor
, INTERFACE = DEDUCED
, typename = void >
struct DeduceFinal
{
KOKKOS_INLINE_FUNCTION
static void final( F const & , ValueType * ) {}
};
template< class F , INTERFACE I >
struct DeduceFinal< F , I ,
decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
  : public has_final_function<F,I> {};
public:
static void final( Functor const & f , ValueType * result )
{ DeduceFinal<>::final( f , result ); }
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
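
To make the deduction above concrete, a hedged sketch: a functor that declares no value_type, for which the REDUCE branch of deduce_value_type recovers the reduction type from the second operator() argument. The functor is the editor's illustration, not from the diff.

struct SumSquares {
  // No 'value_type' declared: FunctorAnalysis< REDUCE , Policy ,
  // SumSquares >::value_type is deduced as 'double' from the
  // 'double &' reduction argument below.
  KOKKOS_INLINE_FUNCTION
  void operator()( const int i , double & update ) const
    { update += double(i) * double(i); }

  // If 'join' and/or 'init' were declared here with matching
  // signatures, DeduceJoin / DeduceInit above would detect and call
  // them; as written, the defaults ('+=' join, value-initialization)
  // apply.
};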

View File

@ -62,7 +62,7 @@
#include <memkind.h>
#endif
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
@ -249,7 +249,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
@ -278,7 +278,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}

View File

@ -43,7 +43,7 @@
#include <algorithm>
#include <Kokkos_Macros.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
#endif
/*--------------------------------------------------------------------------*/
@ -359,7 +359,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
SharedAllocationRecord< Kokkos::HostSpace , void >::
~SharedAllocationRecord()
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::deallocateData(
Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
@ -388,7 +388,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
)
, m_space( arg_space )
{
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
}

View File

@ -0,0 +1,463 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <limits>
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_HostThreadTeam.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_spinwait.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
void HostThreadTeamData::organize_pool
( HostThreadTeamData * members[] , const int size )
{
bool ok = true ;
// Verify not already a member of a pool:
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
}
if ( ok ) {
int64_t * const root_scratch = members[0]->m_scratch ;
for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
root_scratch[i] = 0 ;
}
{
HostThreadTeamData ** const pool =
(HostThreadTeamData **) (root_scratch + m_pool_members);
// team size == 1, league size == pool_size
for ( int rank = 0 ; rank < size ; ++rank ) {
HostThreadTeamData * const mem = members[ rank ] ;
mem->m_pool_scratch = root_scratch ;
mem->m_team_scratch = mem->m_scratch ;
mem->m_pool_rank = rank ;
mem->m_pool_size = size ;
mem->m_team_base = rank ;
mem->m_team_rank = 0 ;
mem->m_team_size = 1 ;
mem->m_team_alloc = 1 ;
mem->m_league_rank = rank ;
mem->m_league_size = size ;
mem->m_pool_rendezvous_step = 0 ;
mem->m_team_rendezvous_step = 0 ;
pool[ rank ] = mem ;
}
}
Kokkos::memory_fence();
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
}
}
void HostThreadTeamData::disband_pool()
{
m_work_range.first = -1 ;
m_work_range.second = -1 ;
m_pool_scratch = 0 ;
m_team_scratch = 0 ;
m_pool_rank = 0 ;
m_pool_size = 1 ;
m_team_base = 0 ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = 0 ;
m_league_size = 1 ;
m_pool_rendezvous_step = 0 ;
m_team_rendezvous_step = 0 ;
}
int HostThreadTeamData::organize_team( const int team_size )
{
// Pool is initialized
const bool ok_pool = 0 != m_pool_scratch ;
// Team is not set
const bool ok_team =
m_team_scratch == m_scratch &&
m_team_base == m_pool_rank &&
m_team_rank == 0 &&
m_team_size == 1 &&
m_team_alloc == 1 &&
m_league_rank == m_pool_rank &&
m_league_size == m_pool_size ;
if ( ok_pool && ok_team ) {
if ( team_size <= 0 ) return 0 ; // No teams to organize
if ( team_size == 1 ) return 1 ; // Already organized in teams of one
HostThreadTeamData * const * const pool =
(HostThreadTeamData **) (m_pool_scratch + m_pool_members);
// "league_size" in this context is the number of concurrent teams
// that the pool can accommodate. Excess threads are idle.
const int league_size = m_pool_size / team_size ;
const int team_alloc_size = m_pool_size / league_size ;
const int team_alloc_rank = m_pool_rank % team_alloc_size ;
const int league_rank = m_pool_rank / team_alloc_size ;
const int team_base_rank = league_rank * team_alloc_size ;
m_team_scratch = pool[ team_base_rank ]->m_scratch ;
m_team_base = team_base_rank ;
    // This needs to check overflow: if m_pool_size % team_alloc_size != 0
    // there are two corner cases:
    // (i) if team_alloc_size == team_size there might be a non-full
    //     zombie team around (for example m_pool_size = 5 and team_size = 2);
    // (ii) if team_alloc > team_size then the last team might have fewer
    //     threads than the others.
m_team_rank = ( team_base_rank + team_size <= m_pool_size ) &&
( team_alloc_rank < team_size ) ?
team_alloc_rank : -1;
m_team_size = team_size ;
m_team_alloc = team_alloc_size ;
m_league_rank = league_rank ;
m_league_size = league_size ;
m_team_rendezvous_step = 0 ;
if ( team_base_rank == m_pool_rank ) {
// Initialize team's rendezvous memory
for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
m_scratch[i] = 0 ;
}
// Make sure team's rendezvous memory initialized
// is written before proceeding.
Kokkos::memory_fence();
}
// Organizing threads into a team performs a barrier across the
    // entire pool to ensure proper initialization of the team
// rendezvous mechanism before a team rendezvous can be performed.
if ( pool_rendezvous() ) {
pool_rendezvous_release();
}
}
else {
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
}
return 0 <= m_team_rank ;
}
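// Worked example of the arithmetic above (editor's illustration):
// with m_pool_size = 5 and team_size = 2, league_size = 2 and
// team_alloc_size = 2. Pool ranks 0,1 form league 0 and ranks 2,3
// form league 1; pool rank 4 computes team_base_rank = 4, and since
// 4 + 2 > 5 its m_team_rank is -1 -- the idle "zombie team" corner
// case noted in the comments above.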
void HostThreadTeamData::disband_team()
{
m_team_scratch = m_scratch ;
m_team_base = m_pool_rank ;
m_team_rank = 0 ;
m_team_size = 1 ;
m_team_alloc = 1 ;
m_league_rank = m_pool_rank ;
m_league_size = m_pool_size ;
m_team_rendezvous_step = 0 ;
}
//----------------------------------------------------------------------------
/* pattern for rendezvous
*
* if ( rendezvous() ) {
* ... all other threads are still in team_rendezvous() ...
* rendezvous_release();
* ... all other threads are released from team_rendezvous() ...
* }
*/
int HostThreadTeamData::rendezvous( int64_t * const buffer
, int & rendezvous_step
, int const size
, int const rank ) noexcept
{
enum : int { shift_byte = 3 };
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
enum : int { mask_byte = size_byte - 1 };
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Cycle step values: 1 <= step <= size_val_cycle
// An odd multiple of memory cycle so that when a memory location
// is reused it has a different value.
// Must be representable within a single byte: size_val_cycle < 16
enum : int { size_val_cycle = 3 * size_mem_cycle };
// Requires:
// Called by rank = [ 0 .. size )
// buffer aligned to int64_t[4]
// A sequence of rendezvous uses four cycled locations in memory
// and non-equal cycled synchronization values to
// 1) prevent rendezvous from overtaking one another and
// 2) give each spin wait location an int64_t[4] span
// so that it has its own cache line.
const int step = ( rendezvous_step % size_val_cycle ) + 1 ;
rendezvous_step = step ;
// The leading int64_t[4] span is for thread 0 to write
// and all other threads to read spin-wait.
// sync_offset is the index into this array for this step.
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
union {
int64_t full ;
int8_t byte[8] ;
} value ;
if ( rank ) {
const int group_begin = rank << shift_byte ; // == rank * size_byte
if ( group_begin < size ) {
// This thread waits for threads
// [ group_begin .. group_begin + 8 )
// [ rank*8 .. rank*8 + 8 )
// to write to their designated bytes.
const int end = group_begin + size_byte < size
? size_byte : size - group_begin ;
value.full = 0 ;
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
store_fence(); // This should not be needed but fixes #742
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
, value.full );
}
{
// This thread sets its designated byte.
// ( rank % size_byte ) +
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
// ( sync_offset * size_byte )
const int offset = ( rank & mask_byte )
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
+ ( sync_offset << shift_byte );
// All of this thread's previous memory stores must be complete before
// this thread stores the step value at this thread's designated byte
// in the shared synchronization array.
Kokkos::memory_fence();
((volatile int8_t*) buffer)[ offset ] = int8_t( step );
// Memory fence to push the previous store out
Kokkos::memory_fence();
}
// Wait for thread 0 to release all other threads
spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) );
}
else {
// Thread 0 waits for threads [1..7]
// to write to their designated bytes.
const int end = size_byte < size ? 8 : size ;
value.full = 0 ;
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
spinwait_until_equal( buffer[ sync_offset ], value.full );
}
return rank ? 0 : 1 ;
}
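// Numeric trace of the indexing above (editor's illustration), for
// step = 5: sync_offset = ( 5 & mask_mem_cycle ) + size_mem_cycle
// = 1 + 4 = 5. Rank 1 gathers its group [ 8 .. 16 ) by spinning on
// buffer[ ( 1 << shift_mem_cycle ) + sync_offset ] = buffer[9].
// Rank 9 signals with its designated byte:
//   ( 9 & mask_byte ) + ( ( 9 & ~mask_byte ) << shift_mem_cycle )
//   + ( sync_offset << shift_byte ) = 1 + 32 + 40 = 73 ,
// and byte 73 is byte 1 of int64_t slot 9 -- exactly the word that
// rank 1 spin-waits on.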
void HostThreadTeamData::
rendezvous_release( int64_t * const buffer
, int const rendezvous_step ) noexcept
{
enum : int { shift_mem_cycle = 2 };
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
// Requires:
// Called after team_rendezvous
// Called only by true == team_rendezvous(root)
// Memory fence to be sure all previous writes are complete:
Kokkos::memory_fence();
((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] =
int64_t( rendezvous_step );
// Memory fence to push the store out
Kokkos::memory_fence();
}
//----------------------------------------------------------------------------
int HostThreadTeamData::get_work_stealing() noexcept
{
pair_int_t w( -1 , -1 );
if ( 1 == m_team_size || team_rendezvous() ) {
// Attempt first from beginning of my work range
for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
// Query and attempt to update m_work_range
// from: [ w.first , w.second )
// to: [ w.first + 1 , w.second ) = w_new
//
      // If w is invalid then this is just a query.
const pair_int_t w_new( w.first + 1 , w.second );
w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
if ( w.first < w.second ) {
// m_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first + 1 &&
w_new.second == w.second );
}
else {
// m_work_range is not viable
w.first = -1 ;
w.second = -1 ;
attempt = 0 ;
}
}
if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
HostThreadTeamData * const * const pool =
(HostThreadTeamData**)( m_pool_scratch + m_pool_members );
      // Attempt from beginning failed, try to steal from end of neighbor.
pair_int_t volatile * steal_range =
& ( pool[ m_steal_rank ]->m_work_range );
for ( int attempt = true ; attempt ; ) {
// Query and attempt to update steal_work_range
// from: [ w.first , w.second )
// to: [ w.first , w.second - 1 ) = w_new
//
        // If w is invalid then this is just a query.
const pair_int_t w_new( w.first , w.second - 1 );
w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
if ( w.first < w.second ) {
// steal_work_range is viable
// If steal is successful then don't repeat attempt to steal
attempt = ! ( w_new.first == w.first &&
w_new.second == w.second - 1 );
}
else {
// steal_work_range is not viable, move to next member
w.first = -1 ;
w.second = -1 ;
          // We need to figure out whether the next team is active:
          // m_steal_rank + m_team_alloc could be the next base_rank to steal from,
          // but only if another m_team_size threads are available so that
          // base rank has a full team.
m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
m_steal_rank + m_team_alloc : 0;
steal_range = & ( pool[ m_steal_rank ]->m_work_range );
// If tried all other members then don't repeat attempt to steal
attempt = m_steal_rank != m_pool_rank ;
}
}
if ( w.first != -1 ) w.first = w.second - 1 ;
}
if ( 1 < m_team_size ) {
// Must share the work index
*((int volatile *) team_reduce()) = w.first ;
team_rendezvous_release();
}
}
else if ( 1 < m_team_size ) {
w.first = *((int volatile *) team_reduce());
}
// May exit because successfully stole work and w is good.
// May exit because no work left to steal and w = (-1,-1).
#if 0
fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
, m_pool_rank , m_pool_size , w.first );
fflush(stdout);
#endif
return w.first ;
}
} // namespace Impl
} // namespace Kokkos
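
The claim/steal protocol in get_work_stealing() reduces to one compare-exchange idiom. Below is a simplified, host-only sketch of the "claim one index from the front" loop, with std::atomic standing in for Kokkos::atomic_compare_exchange; the names are the editor's, not from the diff.

#include <atomic>

struct Range { int first , second ; };

// Claim the front index of [ first , second ), or return -1 if empty.
int claim_front( std::atomic<Range> & work_range )
{
  Range w = work_range.load();
  while ( w.first < w.second ) {
    const Range w_new = { w.first + 1 , w.second };
    // On success 'w' still holds the range we claimed from; on failure
    // it is refreshed with the current value and the loop retries.
    if ( work_range.compare_exchange_strong( w , w_new ) ) return w.first ;
  }
  return -1 ; // range exhausted; the caller may try to steal from a neighbor
}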

File diff suppressed because it is too large

View File

@ -52,6 +52,10 @@ void memory_fence()
{
#if defined( __CUDA_ARCH__ )
__threadfence();
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
asm volatile (
"mfence" ::: "memory"
);
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
__sync_synchronize();

View File

@ -129,8 +129,8 @@
#endif
#ifdef KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RDC
#define KOKKOS_ENABLE_CUDA_RDC KOKKOS_HAVE_CUDA_RDC
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
#endif
#endif
@ -242,9 +242,9 @@
#endif
#endif
#ifdef KOKKOS_HAVE_QTHREAD
#ifndef KOKKOS_ENABLE_QTHREAD
#define KOKKOS_ENABLE_QTHREAD KOKKOS_HAVE_QTHREAD
#ifdef KOKKOS_HAVE_QTHREADS
#ifndef KOKKOS_ENABLE_QTHREADS
#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
#endif
#endif

View File

@ -43,7 +43,7 @@
#include <impl/Kokkos_Profiling_Interface.hpp>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <string.h>
namespace Kokkos {

View File

@ -50,7 +50,7 @@
#include <string>
#include <cinttypes>
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
#include <dlfcn.h>
#include <iostream>
@ -59,7 +59,7 @@
#define KOKKOSP_INTERFACE_VERSION 20150628
#if (KOKKOS_ENABLE_PROFILING)
#if defined(KOKKOS_ENABLE_PROFILING)
namespace Kokkos {
namespace Profiling {

View File

@ -0,0 +1,317 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_REDUCER_HPP
#define KOKKOS_IMPL_REDUCER_HPP
#include <impl/Kokkos_Traits.hpp>
//----------------------------------------------------------------------------
/* Reducer abstraction:
* 1) Provides 'join' operation
* 2) Provides 'init' operation
* 3) Provides 'copy' operation
* 4) Optionally provides result value in a memory space
*
* Created from:
* 1) Functor::operator()( destination , source )
 * 2) Functor::{ join , init }
*/
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template< typename value_type >
struct ReduceSum
{
KOKKOS_INLINE_FUNCTION static
void copy( value_type & dest
, value_type const & src ) noexcept
{ dest = src ; }
KOKKOS_INLINE_FUNCTION static
void init( value_type & dest ) noexcept
{ new( &dest ) value_type(); }
KOKKOS_INLINE_FUNCTION static
void join( value_type volatile & dest
, value_type const volatile & src ) noexcept
{ dest += src ; }
KOKKOS_INLINE_FUNCTION static
void join( value_type & dest
, value_type const & src ) noexcept
{ dest += src ; }
};
template< typename T
, class ReduceOp = ReduceSum< T >
, typename MemorySpace = void >
struct Reducer
: private ReduceOp
, private integral_nonzero_constant
< int , ( std::rank<T>::value == 1 ? std::extent<T>::value : 1 )>
{
private:
// Determine if T is simple array
enum : int { rank = std::rank<T>::value };
static_assert( rank <= 1 , "Kokkos::Impl::Reducer type is at most rank-one" );
using length_t =
integral_nonzero_constant<int,( rank == 1 ? std::extent<T>::value : 1 )> ;
public:
using reducer = Reducer ;
using memory_space = MemorySpace ;
using value_type = typename std::remove_extent<T>::type ;
using reference_type =
typename std::conditional< ( rank != 0 )
, value_type *
, value_type &
>::type ;
private:
//--------------------------------------------------------------------------
// Determine what functions 'ReduceOp' provides:
// copy( destination , source )
// init( destination )
//
// operator()( destination , source )
// join( destination , source )
//
// Provide defaults for missing optional operations
template< class R , typename = void>
struct COPY {
KOKKOS_INLINE_FUNCTION static
void copy( R const &
, value_type * dst
, value_type const * src ) { *dst = *src ; }
};
template< class R >
struct COPY< R , decltype( ((R*)0)->copy( *((value_type*)0)
, *((value_type const *)0) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void copy( R const & r
, value_type * dst
, value_type const * src ) { r.copy( *dst , *src ); }
};
template< class R , typename = void >
struct INIT {
KOKKOS_INLINE_FUNCTION static
void init( R const & , value_type * dst ) { new(dst) value_type(); }
};
template< class R >
struct INIT< R , decltype( ((R*)0)->init( *((value_type*)0 ) ) ) >
{
KOKKOS_INLINE_FUNCTION static
void init( R const & r , value_type * dst ) { r.init( *dst ); }
};
template< class R , typename V , typename = void > struct JOIN
{
// If no join function then try operator()
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.operator()(*dst,*src); }
};
template< class R , typename V >
struct JOIN< R , V , decltype( ((R*)0)->join ( *((V *)0) , *((V const *)0) ) ) >
{
// If has join function use it
KOKKOS_INLINE_FUNCTION static
void join( R const & r , V * dst , V const * src )
{ r.join(*dst,*src); }
};
//--------------------------------------------------------------------------
value_type * const m_result ;
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 != Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return p ; }
template< int Rank >
KOKKOS_INLINE_FUNCTION
static constexpr
typename std::enable_if< ( 0 == Rank ) , reference_type >::type
ref( value_type * p ) noexcept { return *p ; }
public:
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr int length() const noexcept
{ return length_t::value ; }
KOKKOS_INLINE_FUNCTION
value_type * data() const noexcept
{ return m_result ; }
KOKKOS_INLINE_FUNCTION
reference_type reference() const noexcept
{ return Reducer::template ref< rank >( m_result ); }
//--------------------------------------------------------------------------
KOKKOS_INLINE_FUNCTION
void copy( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template COPY<ReduceOp>::copy( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void init( value_type * dest ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template INIT<ReduceOp>::init( (ReduceOp &) *this , dest + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type * const dest
, value_type const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
KOKKOS_INLINE_FUNCTION
void join( value_type volatile * const dest
, value_type volatile const * const src ) const noexcept
{
for ( int i = 0 ; i < length() ; ++i ) {
Reducer::template JOIN<ReduceOp,value_type volatile>::join( (ReduceOp &) *this , dest + i , src + i );
}
}
//--------------------------------------------------------------------------
template< typename ArgT >
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer
( ArgT * arg_value
, typename std::enable_if
< std::is_same<ArgT,value_type>::value &&
std::is_default_constructible< ReduceOp >::value
, int >::type arg_length = 1
) noexcept
: ReduceOp(), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp const & arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
KOKKOS_INLINE_FUNCTION explicit
constexpr Reducer( ReduceOp && arg_op
, value_type * arg_value = 0
, int arg_length = 1 ) noexcept
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
Reducer( Reducer const & ) = default ;
Reducer( Reducer && ) = default ;
Reducer & operator = ( Reducer const & ) = default ;
Reducer & operator = ( Reducer && ) = default ;
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ValueType >
constexpr
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >
Sum( ValueType & arg_value )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & arg_value );
}
template< typename ValueType >
constexpr
Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >
Sum( ValueType * arg_value , int arg_length )
{
static_assert( std::is_trivial<ValueType>::value
, "Kokkos reducer requires trivial value type" );
return Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >( arg_value , arg_length );
}
//----------------------------------------------------------------------------
template< typename ValueType , class JoinType >
Impl::Reducer< ValueType , JoinType >
reducer( ValueType & value , JoinType const & lambda )
{
return Impl::Reducer< ValueType , JoinType >( lambda , & value );
}
} // namespace Kokkos
#endif /* #ifndef KOKKOS_IMPL_REDUCER_HPP */
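
A hedged, host-only usage sketch of the machinery above; ReduceMax is the editor's example (not in the diff), showing the optional hooks the JOIN/INIT detection picks up while COPY falls back to plain assignment.

// A custom ReduceOp: INIT detects 'init', JOIN detects 'join'.
struct ReduceMax {
  void init( double & dest ) const { dest = -1.0e300 ; }
  void join( double & dest , double const & src ) const
    { if ( dest < src ) dest = src ; }
};

// Usage through the 'reducer' factory defined above:
//   double result ;
//   auto r = Kokkos::reducer( result , ReduceMax() );
//   r.init( r.data() );          // result = -1.0e300
//   double x = 42.0 ;
//   r.join( r.data() , & x );    // result = 42.0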

View File

@ -53,63 +53,126 @@
namespace Kokkos {
namespace Impl {
namespace SerialImpl {
namespace {
Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
HostThreadTeamData g_serial_thread_team_data ;
Sentinel::~Sentinel()
{
if ( m_scratch ) { free( m_scratch ); }
m_scratch = 0 ;
m_reduce_end = 0 ;
m_shared_end = 0 ;
}
Sentinel & Sentinel::singleton()
// Resize thread team data scratch memory
void serial_resize_thread_team_data( size_t pool_reduce_bytes
, size_t team_reduce_bytes
, size_t team_shared_bytes
, size_t thread_local_bytes )
{
static Sentinel s ; return s ;
}
if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
inline
unsigned align( unsigned n )
{
enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
return ( n + MASK ) & ~MASK ;
}
const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
} // namespace
// Allocate if any of the old allocations is too small:
SerialTeamMember::SerialTeamMember( int arg_league_rank
, int arg_league_size
, int arg_shared_size
)
: m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
, arg_shared_size )
, m_league_rank( arg_league_rank )
, m_league_size( arg_league_size )
{}
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
( old_team_reduce < team_reduce_bytes ) ||
( old_team_shared < team_shared_bytes ) ||
( old_thread_local < thread_local_bytes );
} // namespace Impl
if ( allocate ) {
void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
{
static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
Kokkos::HostSpace space ;
reduce_size = Impl::SerialImpl::align( reduce_size );
shared_size = Impl::SerialImpl::align( shared_size );
if ( old_alloc_bytes ) {
g_serial_thread_team_data.disband_team();
g_serial_thread_team_data.disband_pool();
if ( ( s.m_reduce_end < reduce_size ) ||
( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
if ( s.m_scratch ) { free( s.m_scratch ); }
if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
s.m_scratch = malloc( s.m_shared_end );
space.deallocate( g_serial_thread_team_data.scratch_buffer()
, g_serial_thread_team_data.scratch_bytes() );
}
return s.m_scratch ;
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
const size_t alloc_bytes =
HostThreadTeamData::scratch_size( pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
void * const ptr = space.allocate( alloc_bytes );
g_serial_thread_team_data.
scratch_assign( ((char *)ptr)
, alloc_bytes
, pool_reduce_bytes
, team_reduce_bytes
, team_shared_bytes
, thread_local_bytes );
HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
g_serial_thread_team_data.organize_pool( pool , 1 );
g_serial_thread_team_data.organize_team(1);
}
}
// Get the thread team data structure for the serial execution space.
HostThreadTeamData * serial_get_thread_team_data()
{
return & g_serial_thread_team_data ;
}
} // namespace Impl
} // namespace Kokkos
/*--------------------------------------------------------------------------*/
namespace Kokkos {
int Serial::is_initialized()
{
return 1 ;
}
void Serial::initialize( unsigned threads_count
, unsigned use_numa_count
, unsigned use_cores_per_numa
, bool allow_asynchronous_threadpool )
{
(void) threads_count;
(void) use_numa_count;
(void) use_cores_per_numa;
(void) allow_asynchronous_threadpool;
// Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space();
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
void Serial::finalize()
{
if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
Impl::g_serial_thread_team_data.disband_team();
Impl::g_serial_thread_team_data.disband_pool();
Kokkos::HostSpace space ;
space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
, Impl::g_serial_thread_team_data.scratch_bytes() );
Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
}
#if defined(KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
} // namespace Kokkos

View File

@ -62,11 +62,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until all queues are empty
while ( 0 < queue->m_ready_count ) {
@ -75,13 +77,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}
if ( end != task ) {
// pop_task resulted in lock == task->m_next
// pop_ready_task resulted in lock == task->m_next
// In the executing state
(*task->m_apply)( task , & exec );
@ -113,11 +115,13 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
using execution_space = Kokkos::Serial ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
using Member = Impl::HostThreadTeamMember< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member exec ;
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
Member exec( *data );
// Loop until no runnable task
@ -129,7 +133,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
}
}

View File

@ -65,6 +65,7 @@ public:
using memory_space = Kokkos::HostSpace ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const );
@ -72,237 +73,19 @@ public:
static
void execute( queue_type * const );
template< typename FunctorType >
template< typename TaskType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::Serial
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
typename TaskType::function_type
get_function_pointer() { return TaskType::apply ; }
};
extern template class TaskQueue< Kokkos::Serial > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::Serial >
{
public:
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
};
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
//const TaskExec< Kokkos::Serial > & thread;
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start(0)
, end(arg_count)
, thread(arg_thread)
{}
KOKKOS_INLINE_FUNCTION
TeamThreadRangeBoundariesStruct
//( const TaskExec< Kokkos::Serial > & arg_thread
( TaskExec< Kokkos::Serial > & arg_thread
, const iType& arg_start
, const iType & arg_end
)
: start( arg_start )
, end( arg_end)
, thread( arg_thread )
{}
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
{
typedef iType index_type;
const iType start ;
const iType end ;
enum {increment = 1};
TaskExec< Kokkos::Serial > & thread;
KOKKOS_INLINE_FUNCTION
ThreadVectorRangeBoundariesStruct
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
: start( 0 )
, end(arg_count)
, thread(arg_thread)
{}
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
// OMP version needs non-const TaskExec
template< typename iType >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >( thread, count );
}
// OMP version needs non-const TaskExec
template< typename iType1, typename iType2 >
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
Impl::TaskExec< Kokkos::Serial > >
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType1 & start, const iType2 & end )
{
typedef typename std::common_type< iType1, iType2 >::type iType;
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >(
thread, iType(start), iType(end) );
}
// OMP version needs non-const TaskExec
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
ThreadVectorRange
( Impl::TaskExec< Kokkos::Serial > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
 * The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i);
}
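// Illustrative sketch (hypothetical functor, not from the original source):
// a task whose operator() distributes work with the TeamThreadRange
// parallel_for defined above. On the Serial backend the "team" is a single
// thread, so the loop simply visits i = 0..n-1 in order.
struct ExampleVectorScale {
  using value_type = void ;
  double * x ; int n ;

  KOKKOS_INLINE_FUNCTION
  void operator()( Impl::TaskExec< Kokkos::Serial > & member )
    {
      parallel_for( TeamThreadRange( member, n )
                  , [&]( const int i ) { x[i] *= 2.0 ; } );
    }
};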
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
lambda(i, result);
initialized_result = result;
}
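// Illustrative sketch (hypothetical helper): the join-based overload above
// used for a max-reduction instead of the default sum. In this serial
// specialization the join is never invoked, since the single thread
// accumulates directly into 'result', but the call pattern matches the API.
template< typename Member > // e.g. Impl::TaskExec< Kokkos::Serial >
KOKKOS_INLINE_FUNCTION
int example_team_max( Member & member, const int * data, const int n )
{
  int result = 0 ;
  parallel_reduce( TeamThreadRange( member, n )
                 , [&]( const int i, int & update )
                     { if ( data[i] > update ) update = data[i] ; }
                 , []( int & dst, const int & src )
                     { if ( src > dst ) dst = src ; }
                 , result );
  return result ;
}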
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
initialized_result = ValueType();
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
initialized_result+=tmp;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
ValueType result = initialized_result;
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
#pragma ivdep
#endif
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
ValueType tmp = ValueType();
lambda(i,tmp);
join(result,tmp);
}
initialized_result = result;
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
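// Illustrative sketch (hypothetical helper) of the scan protocol above: the
// lambda runs once per index with final == false to accumulate its local
// contribution, then once with final == true carrying the exclusive prefix.
// ValueType cannot be deduced from the arguments, so it is given explicitly.
template< typename Member > // e.g. Impl::TaskExec< Kokkos::Serial >
KOKKOS_INLINE_FUNCTION
void example_exclusive_scan( Member & member
                           , const int * values, int * prefix, const int n )
{
  parallel_scan< int >( TeamThreadRange( member, n )
    , [&]( const int i, int & partial, const bool final )
        {
          if ( final ) { prefix[i] = partial ; } // sum of values[0..i-1]
          partial += values[i] ;
        } );
}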
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */

@ -1,693 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_HPP
#define KOKKOS_SYNCHRONIC_HPP
#include <impl/Kokkos_Synchronic_Config.hpp>
#include <atomic>
#include <chrono>
#include <thread>
#include <functional>
#include <algorithm>
namespace Kokkos {
namespace Impl {
enum notify_hint {
notify_all,
notify_one,
notify_none
};
enum expect_hint {
expect_urgent,
expect_delay
};
namespace Details {
template <class S, class T>
bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
int i = 0;
for(;i < __SYNCHRONIC_SPIN_RELAX(attempts); ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_relax();
for(;i < attempts; ++i)
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
return true;
else
__synchronic_yield();
return false;
}
struct __exponential_backoff {
__exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) {
}
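// Sleep granularity heuristic: waits longer than 75us use a real sleep,
// waits longer than 25us yield the thread, anything shorter just relaxes.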
static inline void sleep_for(std::chrono::microseconds const& time) {
auto t = time.count();
if(__builtin_expect(t > 75,0)) {
portable_sleep(time);
}
else if(__builtin_expect(t > 25,0))
__synchronic_yield();
else
__synchronic_relax();
}
void sleep_for_step() {
sleep_for(step());
}
std::chrono::microseconds step() {
float const f = ranfu();
int const t = int(microseconds * f);
if(__builtin_expect(f >= 0.95f,0))
microseconds = 8;
else
microseconds = (std::min)(microseconds>>1,maximum);
return std::chrono::microseconds(t);
}
private :
int maximum, microseconds, x, y, z;
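// Marsaglia-style xorshift generator: cheap pseudo-randomness used only
// to jitter the backoff interval; no statistical quality is required.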
int xorshf96() {
int t;
x ^= x << 16; x ^= x >> 5; x ^= x << 1;
t = x; x = y; y = z; z = t ^ x ^ y;
return z;
}
float ranfu() {
return (float)(xorshf96()&(~0UL>>1)) / (float)(~0UL>>1);
}
};
template <class T, class Enable = void>
struct __synchronic_base {
protected:
std::atomic<T> atom;
void notify(notify_hint = notify_all) noexcept {
}
void notify(notify_hint = notify_all) volatile noexcept {
}
public :
__synchronic_base() noexcept = default;
constexpr __synchronic_base(T v) noexcept : atom(v) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
while(atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
return;
__exponential_backoff b;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
__do_backoff(b);
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
return;
remains = then - std::chrono::high_resolution_clock::now();
}
}
};
#ifdef __SYNCHRONIC_COMPATIBLE
template <class T>
struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
public:
std::atomic<T> atom;
void notify(notify_hint hint = notify_all) noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all(&atom);
else
__synchronic_wake_one(&atom);
}
}
void notify(notify_hint hint = notify_all) volatile noexcept {
if(__builtin_expect(hint == notify_none,1))
return;
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
if(__builtin_expect(x,0)) {
if(__builtin_expect(hint == notify_all,1))
__synchronic_wake_all_volatile(&atom);
else
__synchronic_wake_one_volatile(&atom);
}
}
public :
__synchronic_base() noexcept : count(0) { }
constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
__synchronic_base(const __synchronic_base&) = delete;
~__synchronic_base() { }
__synchronic_base& operator=(const __synchronic_base&) = delete;
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_volatile(&atom,val);
count.fetch_add(-1,std::memory_order_acquire);
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
template <class Clock, class Duration>
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
return;
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
count.fetch_add(1,std::memory_order_release);
__synchronic_wait_timed_volatile(&atom,val,remains);
count.fetch_add(-1,std::memory_order_acquire);
remains = then - std::chrono::high_resolution_clock::now();
}
}
private:
mutable std::atomic<int> count;
};
#endif
template <class T, class Enable = void>
struct __synchronic : public __synchronic_base<T> {
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
};
template <class T>
struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_and(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_or(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_xor(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T operator=(T v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator=(T v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T operator+=(T v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator+=(T v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T operator-=(T v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator-=(T v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T operator&=(T v) volatile noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator&=(T v) noexcept {
auto const t = this->atom &= v;
this->notify();
return t;
}
T operator|=(T v) volatile noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator|=(T v) noexcept {
auto const t = this->atom |= v;
this->notify();
return t;
}
T operator^=(T v) volatile noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
T operator^=(T v) noexcept {
auto const t = this->atom ^= v;
this->notify();
return t;
}
};
template <class T>
struct __synchronic<T*> : public __synchronic_base<T*> {
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_add(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.fetch_sub(v,m);
this->notify(n);
return t;
}
__synchronic() noexcept = default;
constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
__synchronic(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) = delete;
__synchronic& operator=(const __synchronic&) volatile = delete;
T* operator=(T* v) volatile noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator=(T* v) noexcept {
auto const t = this->atom = v;
this->notify();
return t;
}
T* operator++(int) volatile noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator++(int) noexcept {
auto const t = ++this->atom;
this->notify();
return t;
}
T* operator--(int) volatile noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator--(int) noexcept {
auto const t = --this->atom;
this->notify();
return t;
}
T* operator++() volatile noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator++() noexcept {
auto const t = this->atom++;
this->notify();
return t;
}
T* operator--() volatile noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator--() noexcept {
auto const t = this->atom--;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator+=(ptrdiff_t v) noexcept {
auto const t = this->atom += v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) volatile noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
T* operator-=(ptrdiff_t v) noexcept {
auto const t = this->atom -= v;
this->notify();
return t;
}
};
} //namespace Details
template <class T>
struct synchronic : public Details::__synchronic<T> {
bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
this->atom.store(v,m);
this->notify(n);
}
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
this->atom.store(v,m);
this->notify(n);
}
T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
operator T() const volatile noexcept { return (T)this->atom; }
operator T() const noexcept { return (T)this->atom; }
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.exchange(v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_weak(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
auto const t = this->atom.compare_exchange_strong(r,v,m);
this->notify(n);
return t;
}
synchronic() noexcept = default;
constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
synchronic(const synchronic&) = delete;
~synchronic() { }
synchronic& operator=(const synchronic&) = delete;
synchronic& operator=(const synchronic&) volatile = delete;
T operator=(T val) noexcept {
return Details::__synchronic<T>::operator=(val);
}
T operator=(T val) volatile noexcept {
return Details::__synchronic<T>::operator=(val);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
Details::__synchronic<T>::expect_update(val,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
Details::__synchronic<T>::expect_update(nval,h);
return load(order);
}
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
expect_update(nval,h);
return load(order);
}
template <class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
template < class Rep, class Period>
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
}
};
#include <inttypes.h>
typedef synchronic<char> synchronic_char;
typedef synchronic<signed char> synchronic_schar;
typedef synchronic<unsigned char> synchronic_uchar;
typedef synchronic<short> synchronic_short;
typedef synchronic<unsigned short> synchronic_ushort;
typedef synchronic<int> synchronic_int;
typedef synchronic<unsigned int> synchronic_uint;
typedef synchronic<long> synchronic_long;
typedef synchronic<unsigned long> synchronic_ulong;
typedef synchronic<long long> synchronic_llong;
typedef synchronic<unsigned long long> synchronic_ullong;
//typedef synchronic<char16_t> synchronic_char16_t;
//typedef synchronic<char32_t> synchronic_char32_t;
typedef synchronic<wchar_t> synchronic_wchar_t;
typedef synchronic<int_least8_t> synchronic_int_least8_t;
typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
typedef synchronic<int_least16_t> synchronic_int_least16_t;
typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
typedef synchronic<int_least32_t> synchronic_int_least32_t;
typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
//typedef synchronic<int_least64_t> synchronic_int_least64_t;
typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
typedef synchronic<intptr_t> synchronic_intptr_t;
typedef synchronic<uintptr_t> synchronic_uintptr_t;
typedef synchronic<size_t> synchronic_size_t;
typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
typedef synchronic<intmax_t> synchronic_intmax_t;
typedef synchronic<uintmax_t> synchronic_uintmax_t;
}
}
#endif // KOKKOS_SYNCHRONIC_HPP

@ -1,169 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
#define KOKKOS_SYNCHRONIC_CONFIG_H
#include <thread>
#include <chrono>
namespace Kokkos {
namespace Impl {
//the default yield function used inside the implementation is the Standard one
#define __synchronic_yield std::this_thread::yield
#define __synchronic_relax __synchronic_yield
#if defined(_MSC_VER)
//this is a handy GCC optimization that I use inside the implementation
#define __builtin_expect(condition,common) condition
#if _MSC_VER <= 1800
//using certain keywords that VC++ temporarily doesn't support
#define _ALLOW_KEYWORD_MACROS
#define noexcept
#define constexpr
#endif
//yes, I define multiple assignment operators
#pragma warning(disable:4522)
//I don't understand how Windows is so bad at timing functions, but it is
//OK with straight-up yield loops
#define __do_backoff(b) __synchronic_yield()
#else
#define __do_backoff(b) b.sleep_for_step()
#endif
//certain platforms have efficient support for spin-waiting built into the operating system
#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
#if defined(_WIN32_WINNT)
#include <winsock2.h>
#include <Windows.h>
//the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
#define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
#define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
#define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
#define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
#define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
inline void native_sleep(unsigned long microseconds)
{
// What to do if microseconds is < 1000?
Sleep(microseconds / 1000);
}
inline void native_yield()
{
SwitchToThread();
}
#elif defined(__linux__)
#include <chrono>
#include <time.h>
#include <unistd.h>
#include <pthread.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <climits>
#include <cassert>
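// Convert a chrono duration to a timespec for the futex timeout. The assert
// below documents that only sub-second waits are expected on this path:
// tv_sec must be zero and the whole delta is expressed in nanoseconds.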
template < class Rep, class Period>
inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
struct timespec ts;
ts.tv_sec = static_cast<long>(std::chrono::duration_cast<std::chrono::seconds>(delta).count());
assert(!ts.tv_sec);
ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count());
return ts;
}
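// Thin wrappers over the raw futex system call, for which glibc provides
// no wrapper; the timed variant forwards the timeout to FUTEX_WAIT.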
inline long futex(void const* addr1, int op, int val1) {
return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
}
inline long futex(void const* addr1, int op, int val1, struct timespec timeout) {
return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
}
inline void native_sleep(unsigned long microseconds)
{
usleep(microseconds);
}
inline void native_yield()
{
pthread_yield();
}
//the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
#define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
#define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
#define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
#define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
#define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
#define __synchronic_wait_volatile_timed(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
#define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
#define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
//the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
#undef __synchronic_yield
#define __synchronic_yield sched_yield
//for extremely short wait times, just let another hyper-thread run
#undef __synchronic_relax
#define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
#endif
#endif
#ifdef _GLIBCXX_USE_NANOSLEEP
inline void portable_sleep(std::chrono::microseconds const& time)
{ std::this_thread::sleep_for(time); }
#else
inline void portable_sleep(std::chrono::microseconds const& time)
{ native_sleep(time.count()); }
#endif
#ifdef _GLIBCXX_USE_SCHED_YIELD
inline void portable_yield()
{ std::this_thread::yield(); }
#else
inline void portable_yield()
{ native_yield(); }
#endif
//this is the number of times we initially spin, on the first wait attempt
#define __SYNCHRONIC_SPIN_COUNT_A 16
//this is how we decide to yield instead of just spinning, 'c' is the current trip count
//#define __SYNCHRONIC_SPIN_YIELD(c) true
#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
//this is the number of times we normally spin, on every subsequent wait attempt
#define __SYNCHRONIC_SPIN_COUNT_B 8
}
}
#endif // KOKKOS_SYNCHRONIC_CONFIG_H

@ -1,162 +0,0 @@
/*
Copyright (c) 2014, NVIDIA Corporation
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
#define KOKKOS_SYNCHRONIC_N3998_HPP
#include <impl/Kokkos_Synchronic.hpp>
#include <functional>
/*
In the section below, a synchronization point represents a point at which a
thread may block until a given synchronization condition has been reached or
at which it may notify other threads that a synchronization condition has
been achieved.
*/
namespace Kokkos { namespace Impl {
/*
A latch maintains an internal counter that is initialized when the latch
is created. The synchronization condition is reached when the counter is
decremented to 0. Threads may block at a synchronization point waiting
for the condition to be reached. When the condition is reached, any such
blocked threads will be released.
*/
struct latch {
latch(int val) : count(val), released(false) { }
latch(const latch&) = delete;
latch& operator=(const latch&) = delete;
~latch( ) { }
void arrive( ) {
__arrive( );
}
void arrive_and_wait( ) {
if(!__arrive( ))
wait( );
}
void wait( ) {
while(!released.load_when_not_equal(false,std::memory_order_acquire))
;
}
bool try_wait( ) {
return released.load(std::memory_order_acquire);
}
private:
bool __arrive( ) {
if(count.fetch_add(-1,std::memory_order_release)!=1)
return false;
released.store(true,std::memory_order_release);
return true;
}
std::atomic<int> count;
synchronic<bool> released;
};
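// A hedged usage sketch (hypothetical helper, not part of the original
// interface): two workers arrive, the caller blocks until both have.
inline void example_latch_usage()
{
  latch ready(2);
  std::thread w1( [&]{ /* produce */ ready.arrive(); } );
  std::thread w2( [&]{ /* produce */ ready.arrive(); } );
  ready.wait();            // released once the internal count reaches zero
  w1.join(); w2.join();
}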
/*
A barrier is created with an initial value representing the number of threads
that can arrive at the synchronization point. When that many threads have
arrived, the synchronization condition is reached and the threads are
released. The barrier will then reset, and may be reused for a new cycle, in
which the same set of threads may arrive again at the synchronization point.
The same set of threads shall arrive at the barrier in each cycle, otherwise
the behaviour is undefined.
*/
struct barrier {
barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { }
barrier(const barrier&) = delete;
barrier& operator=(const barrier&) = delete;
~barrier() { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
expected = nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
};
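// A hedged usage sketch (hypothetical helper): the same two threads meet at
// the barrier every cycle; phase 2 starts only after both finish phase 1.
inline void example_barrier_usage()
{
  barrier sync(2);
  auto work = [&]{
    /* phase 1 */
    sync.arrive_and_wait();
    /* phase 2 */
  };
  std::thread t1( work ), t2( work );
  t1.join(); t2.join();
}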
/*
A notifying barrier behaves as a barrier, but is constructed with a callable
completion function that is invoked after all threads have arrived at the
synchronization point, and before the synchronization condition is reached.
The completion may modify the set of threads that arrives at the barrier in
each cycle.
*/
struct notifying_barrier {
template <typename T>
notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { }
notifying_barrier(const notifying_barrier&) = delete;
notifying_barrier& operator=(const notifying_barrier&) = delete;
~notifying_barrier( ) { }
void arrive_and_wait() {
int const myepoch = epoch.load(std::memory_order_relaxed);
if(!__arrive(myepoch))
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
;
}
void arrive_and_drop() {
nexpected.fetch_add(-1,std::memory_order_relaxed);
__arrive(epoch.load(std::memory_order_relaxed));
}
private:
bool __arrive(int const myepoch) {
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
if(__builtin_expect(myresult == expected,0)) {
int const newexpected = completion();
expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed);
arrived.store(0,std::memory_order_relaxed);
epoch.store(myepoch+1,std::memory_order_release);
return true;
}
return false;
}
int expected;
std::atomic<int> arrived, nexpected;
synchronic<int> epoch;
std::function<int()> completion;
};
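// A hedged usage sketch (hypothetical helper): the completion function runs
// once per cycle after all arrivals; returning 0 keeps the expected count.
inline void example_notifying_barrier_usage()
{
  int cycles = 0 ;
  notifying_barrier sync( 2, [&]{ ++cycles; return 0; } );
  auto work = [&]{ sync.arrive_and_wait(); };
  std::thread t1( work ), t2( work );
  t1.join(); t2.join();    // cycles == 1 afterwards
}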
}}
#endif // KOKKOS_SYNCHRONIC_N3998_HPP

@ -76,9 +76,6 @@ namespace Impl {
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
} /* namespace Impl */
} /* namespace Kokkos */
@ -149,8 +146,8 @@ private:
// task->m_next is the dependence or zero
// Postcondition:
// task->m_next is linked list membership
KOKKOS_FUNCTION
void schedule( task_root_type * const );
KOKKOS_FUNCTION void schedule_runnable( task_root_type * const );
KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
// Reschedule a task
// Precondition:
@ -178,7 +175,7 @@ private:
, task_root_type * const );
KOKKOS_FUNCTION
static task_root_type * pop_task( task_root_type * volatile * const );
static task_root_type * pop_ready_task( task_root_type * volatile * const );
KOKKOS_FUNCTION static
void decrement( task_root_type * task );
@ -368,6 +365,7 @@ public:
int16_t m_task_type ; ///< Type of task
int16_t m_priority ; ///< Priority of runnable task
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -375,17 +373,43 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for a runnable task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase() noexcept
: m_apply(0)
, m_queue(0)
, m_wait(0)
, m_next(0)
, m_ref_count(0)
, m_alloc_size(0)
, m_dep_count(0)
, m_task_type( TaskSingle )
, m_priority( 1 /* TaskRegularPriority */ )
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, TaskBase * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
) noexcept
: m_apply( arg_apply )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( arg_dependence )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( 0 )
, m_task_type( arg_task_type )
, m_priority( arg_priority )
{}
// Constructor for an aggregate task
KOKKOS_INLINE_FUNCTION
constexpr TaskBase( queue_type * arg_queue
, int arg_ref_count
, int arg_alloc_size
, int arg_dep_count
) noexcept
: m_apply( 0 )
, m_queue( arg_queue )
, m_wait( 0 )
, m_next( 0 )
, m_ref_count( arg_ref_count )
, m_alloc_size( arg_alloc_size )
, m_dep_count( arg_dep_count )
, m_task_type( Aggregate )
, m_priority( 0 )
{}
//----------------------------------------
@ -406,9 +430,13 @@ public:
KOKKOS_INLINE_FUNCTION
void add_dependence( TaskBase* dep )
{
// Precondition: lock == m_next
TaskBase * const lock = (TaskBase *) LockTag ;
// Assign dependence to m_next. It will be processed in the subsequent
// call to schedule. Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange( & m_next, dep ) ) {
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
}
@ -431,8 +459,13 @@ class TaskBase< ExecSpace , ResultType , void >
{
private:
static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
using root_type = TaskBase<ExecSpace,void,void> ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
static_assert( sizeof(root_type) == 48 , "" );
TaskBase() = delete ;
TaskBase( TaskBase && ) = delete ;
TaskBase( const TaskBase & ) = delete ;
TaskBase & operator = ( TaskBase && ) = delete ;
@ -444,9 +477,24 @@ public:
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase()
: TaskBase< ExecSpace , void , void >()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
)
: root_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, m_result()
{}
@ -473,7 +521,10 @@ public:
using root_type = TaskBase< ExecSpace , void , void > ;
using base_type = TaskBase< ExecSpace , ResultType , void > ;
using member_type = TaskExec< ExecSpace > ;
using specialization = TaskQueueSpecialization< ExecSpace > ;
using function_type = typename root_type::function_type ;
using queue_type = typename root_type::queue_type ;
using member_type = typename specialization::member_type ;
using functor_type = FunctorType ;
using result_type = ResultType ;
@ -522,13 +573,30 @@ public:
if ( 0 == member->team_rank() && !(task->requested_respawn()) ) {
// Did not respawn, destroy the functor to free memory.
static_cast<functor_type*>(task)->~functor_type();
// Cannot destroy the task until its dependences have been processed.
// Cannot destroy and deallocate the task until its dependences
// have been processed.
}
}
// Constructor for runnable task
KOKKOS_INLINE_FUNCTION
TaskBase( functor_type const & arg_functor )
: base_type()
constexpr TaskBase( function_type arg_apply
, queue_type * arg_queue
, root_type * arg_dependence
, int arg_ref_count
, int arg_alloc_size
, int arg_task_type
, int arg_priority
, FunctorType && arg_functor
)
: base_type( arg_apply
, arg_queue
, arg_dependence
, arg_ref_count
, arg_alloc_size
, arg_task_type
, arg_priority
)
, functor_type( arg_functor )
{}

@ -170,6 +170,7 @@ bool TaskQueue< ExecSpace >::push_task
)
{
// Push task into a concurrently pushed and popped queue.
// The queue can be either a ready task queue or a waiting task queue.
// The queue is a linked list where 'task->m_next' form the links.
// Fail the push attempt if the queue is locked;
// otherwise retry until the push succeeds.
@ -227,13 +228,12 @@ bool TaskQueue< ExecSpace >::push_task
template< typename ExecSpace >
KOKKOS_FUNCTION
typename TaskQueue< ExecSpace >::task_root_type *
TaskQueue< ExecSpace >::pop_task
TaskQueue< ExecSpace >::pop_ready_task
( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
{
// Pop task from a concurrently pushed and popped queue.
// Pop task from a concurrently pushed and popped ready task queue.
// The queue is a linked list where 'task->m_next' form the links.
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
@ -252,42 +252,34 @@ TaskQueue< ExecSpace >::pop_task
// (1) lock, (2) end, or (3) a valid task.
// Thus zero will never appear in the queue.
//
// If queue is locked then just read by guaranteeing
// the CAS will fail.
// If queue is locked then just read by guaranteeing the CAS will fail.
if ( lock == task ) task = 0 ;
task_root_type * const x = task ;
task = Kokkos::atomic_compare_exchange(queue,task,lock);
if ( x == task ) break ; // CAS succeeded and queue is locked
}
if ( end != task ) {
task = Kokkos::atomic_compare_exchange(queue,x,lock);
if ( x == task ) {
// CAS succeeded and queue is locked
//
// This thread has locked the queue and removed 'task' from the queue.
// Extract the next entry of the queue from 'task->m_next'
// and mark 'task' as popped from a queue by setting
// 'task->m_next = lock'.
task_root_type * const next =
Kokkos::atomic_exchange( & task->m_next , lock );
//
// Place the next entry in the head of the queue,
// which also unlocks the queue.
//
// This thread has exclusive access to
// the queue and the popped task's m_next.
task_root_type * const unlock =
Kokkos::atomic_exchange( queue , next );
*queue = task->m_next ; task->m_next = lock ;
if ( next == zero || next == lock || lock != unlock ) {
Kokkos::abort("TaskQueue::pop_task ERROR");
}
}
Kokkos::memory_fence();
#if 0
if ( end != task ) {
printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
, uintptr_t(queue)
, uintptr_t(task)
, uintptr_t(task->m_wait)
@ -295,42 +287,166 @@ TaskQueue< ExecSpace >::pop_task
, int(task->m_task_type)
, int(task->m_priority)
, int(task->m_ref_count) );
}
#endif
return task ;
}
}
return end ;
}
//----------------------------------------------------------------------------
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule
void TaskQueue< ExecSpace >::schedule_runnable
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule a runnable or when_all task upon construction / spawn
// Schedule a runnable task upon construction / spawn
// and upon completion of other tasks that 'task' is waiting on.
// Precondition on runnable task state:
// task is either constructing or executing
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
// - if runnable then task is either constructing or respawning
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence
// Executing-respawn state:
// task->m_wait == head of linked list
// task->m_next == dependence
// task->m_next == dependence or 0
// Respawn state:
// task->m_wait == head of linked list: 'end' or valid task
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
// Executing-respawn -> Waiting
// Respawn -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list
// task->m_next == member of linked list
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
, task->m_task_type
, task->m_priority
, task->m_ref_count );
#endif
task_root_type * const zero = (task_root_type *) 0 ;
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
bool respawn = false ;
//----------------------------------------
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock != task->m_wait ) {
// Task in Executing state with Respawn request
// - Update dependence
// - Transition to Waiting state
respawn = true ;
}
else {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
}
//----------------------------------------
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
// Exclusive access so don't need an atomic exchange
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
task_root_type * dep = task->m_next ; task->m_next = zero ;
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
if ( ( 0 != dep ) && respawn ) {
// The reference count for dep was incremented when respawn
// assigned the dependency to task->m_next, so that if dep
// completed prior to the above push_task it would not be
// destroyed. That reference count can now be decremented,
// which may deallocate dep.
TaskQueue::assign( & dep , (task_root_type *)0 );
}
if ( is_ready ) {
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const ready_queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
while ( ! push_task( ready_queue , task ) );
}
//----------------------------------------
// Postcondition:
// - A runnable 'task' was pushed into a wait or ready queue.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
template< typename ExecSpace >
KOKKOS_FUNCTION
void TaskQueue< ExecSpace >::schedule_aggregate
( TaskQueue< ExecSpace >::task_root_type * const task )
{
// Schedule an aggregate task upon construction
// and upon completion of other tasks that 'task' is waiting on.
//
// Precondition:
// - called by a single thread for the input task
// - calling thread has exclusive access to the task
// - task is not a member of a queue
//
// Constructing state:
// task->m_wait == 0
// task->m_next == dependence or 0
//
// Task state transition:
// Constructing -> Waiting
//
// Postcondition on task state:
// task->m_wait == head of linked list (queue)
// task->m_next == member of linked list (queue)
#if 0
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
, uintptr_t(task)
, uintptr_t(task->m_wait)
, uintptr_t(task->m_next)
@ -344,71 +460,22 @@ void TaskQueue< ExecSpace >::schedule
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
//----------------------------------------
{
// If Constructing then task->m_wait == 0
// Change to waiting by task->m_wait = EndTag
task_root_type * const init =
Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
if ( zero == task->m_wait ) {
// Task in Constructing state
// - Transition to Waiting state
// Preconditions:
// - call occurs exclusively within a single thread
// Precondition
if ( lock == init ) {
Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
task->m_wait = end ;
// Task in Waiting state
}
else if ( lock == task->m_wait ) {
// Task in Complete state
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
}
// if ( init == 0 ) Constructing -> Waiting
// else Executing-Respawn -> Waiting
}
//----------------------------------------
if ( task_root_type::Aggregate != task->m_task_type ) {
// Scheduling a runnable task which may have a dependency 'dep'.
// Extract dependence, if any, from task->m_next.
// If 'dep' is not null then attempt to push 'task'
// into the wait queue of 'dep'.
// If the push succeeds then 'task' may be
// processed or executed by another thread at any time.
// If the push fails then 'dep' is complete and 'task'
// is ready to execute.
task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
const bool is_ready =
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
// Reference count for dep was incremented when assigned
// to task->m_next so that if it completed prior to the
// above push_task dep would not be destroyed.
// dep reference count can now be decremented,
// which may deallocate the task.
TaskQueue::assign( & dep , (task_root_type *)0 );
if ( is_ready ) {
// No dependence or 'dep' is complete so push task into ready queue.
// Increment the ready count before pushing into ready queue
// to track number of ready + executing tasks.
// The ready count will be decremented when the task is complete.
Kokkos::atomic_increment( & m_ready_count );
task_root_type * volatile * const queue =
& m_ready[ task->m_priority ][ task->m_task_type ];
// A push_task fails if the ready queue is locked.
// A ready queue is only locked during a push or pop;
// i.e., it is never permanently locked.
// Retry push to ready queue until it succeeds.
// When the push succeeds then 'task' may be
// processed or executed by another thread at any time.
while ( ! push_task( queue , task ) );
}
}
//----------------------------------------
else {
// Scheduling a 'when_all' task with multiple dependences.
// This scheduling may be called when the 'when_all' is
// (1) created or
@ -432,7 +499,9 @@ void TaskQueue< ExecSpace >::schedule
// The reference count of 'x' was incremented when
// it was assigned into the dependence list.
task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
// Exclusive access so don't need an atomic exchange
// task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
task_root_type * x = aggr[i] ; aggr[i] = zero ;
if ( x ) {
@ -464,13 +533,11 @@ void TaskQueue< ExecSpace >::schedule
// '*task' may have been deleted upon completion
}
}
//----------------------------------------
// Postcondition:
// A runnable 'task' was pushed into a wait or ready queue.
// An aggregate 'task' was either pushed to a wait queue
// or completed.
// Concurrent execution may have already popped 'task'
// - An aggregate 'task' was either pushed to a wait queue or completed.
// - Concurrent execution may have already popped 'task'
// from a queue and processed it as appropriate.
}
@ -529,7 +596,7 @@ void TaskQueue< ExecSpace >::complete
// A runnable task has finished executing and requested respawn.
// Schedule the task for subsequent execution.
schedule( task );
schedule_runnable( task );
}
//----------------------------------------
else {
@ -556,18 +623,22 @@ void TaskQueue< ExecSpace >::complete
TaskQueue::assign( & task , zero );
// This thread has exclusive access to the wait list so
// the concurrency-safe pop_task function is not needed.
// the concurrency-safe pop_ready_task function is not needed.
// Schedule the tasks that have been waiting on the input 'task',
// which may have been deleted.
while ( x != end ) {
// Have exclusive access to 'x' until it is scheduled
// Set x->m_next = zero <= no dependence, not a respawn
// Set x->m_next = zero <= no dependence
task_root_type * const next = x->m_next ; x->m_next = 0 ;
task_root_type * const next =
(task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
schedule( x );
if ( task_root_type::Aggregate != x->m_task_type ) {
schedule_runnable( x );
}
else {
schedule_aggregate( x );
}
x = next ;
}

@ -45,6 +45,7 @@
#define KOKKOS_CORE_IMPL_UTILITIES_HPP
#include <Kokkos_Macros.hpp>
#include <stdint.h>
#include <type_traits>
//----------------------------------------------------------------------------

@ -42,46 +42,138 @@
*/
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_spinwait.hpp>
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_BitOps.hpp>
/*--------------------------------------------------------------------------*/
#if ( KOKKOS_ENABLE_ASM )
#if !defined( _WIN32 )
#if defined( KOKKOS_ENABLE_ASM )
#if defined( __arm__ ) || defined( __aarch64__ )
/* No-operation instruction to idle the thread. */
#define YIELD asm volatile("nop")
#define KOKKOS_INTERNAL_PAUSE
#else
/* Pause instruction to prevent excess processor bus usage */
#define YIELD asm volatile("pause\n":::"memory")
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
#endif
#elif defined ( KOKKOS_ENABLE_WINTHREAD )
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0u: KOKKOS_INTERNAL_NOP2; break;
case 1u: KOKKOS_INTERNAL_NOP4; break;
case 2u: KOKKOS_INTERNAL_NOP8; break;
case 3u: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#else
#include <sched.h>
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
sched_yield();
}
}
#endif
#else // defined( _WIN32 )
#if defined ( KOKKOS_ENABLE_WINTHREAD )
#include <process.h>
#define YIELD Sleep(0)
#elif defined( _WIN32 ) && defined( _MSC_VER )
/* Windows w/ Visual Studio */
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
Sleep(0);
}
}
#elif defined( _MSC_VER )
#define NOMINMAX
#include <winsock2.h>
#include <windows.h>
#define YIELD YieldProcessor();
#elif defined ( _WIN32 )
/* Windows w/ Intel */
#define YIELD __asm__ __volatile__("pause\n":::"memory")
#else
#include <sched.h>
#define YIELD sched_yield()
namespace {
inline void kokkos_internal_yield( const unsigned ) noexcept {
YieldProcessor();
}
}
#else
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
namespace {
inline void kokkos_internal_yield( const unsigned i ) noexcept {
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
case 0: KOKKOS_INTERNAL_NOP2; break;
case 1: KOKKOS_INTERNAL_NOP4; break;
case 2: KOKKOS_INTERNAL_NOP8; break;
case 3: KOKKOS_INTERNAL_NOP16; break;
default: KOKKOS_INTERNAL_NOP32;
}
KOKKOS_INTERNAL_PAUSE;
}
}
#endif
#endif
/*--------------------------------------------------------------------------*/
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value )
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
YIELD ;
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value == flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
{
Kokkos::store_fence();
unsigned i = 0;
while ( value != flag ) {
kokkos_internal_yield(i);
++i;
}
Kokkos::load_fence();
}
#endif
} /* namespace Impl */
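
The yield helpers above implement a capped exponential backoff: the iteration count is mapped through bit_scan_reverse so that successive waits spin on progressively longer nop bursts (2, 4, 8, 16, capped at 32) before each pause. The sketch below reproduces only that arithmetic; bit_scan_reverse here is a portable stand-in for the internal Kokkos::Impl::bit_scan_reverse, and nops_for_iteration is a hypothetical name.

#include <cstdio>

// Index of the highest set bit; the input is always non-zero below.
inline unsigned bit_scan_reverse( unsigned v )
{
  unsigned r = 0 ;
  while ( v >>= 1 ) ++r ;
  return r ;
}

inline unsigned nops_for_iteration( const unsigned i )
{
  switch ( bit_scan_reverse( ( i >> 2 ) + 1u ) ) {
    case 0u : return 2 ;
    case 1u : return 4 ;
    case 2u : return 8 ;
    case 3u : return 16 ;
    default : return 32 ;   // the backoff is capped
  }
}

int main()
{
  for ( unsigned i = 0 ; i < 72 ; i += 8 ) {
    std::printf( "iteration %2u : %2u nops + pause\n" , i , nops_for_iteration( i ) );
  }
  return 0 ;
}
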


@ -47,14 +47,30 @@
#include <Kokkos_Macros.hpp>
#include <cstdint>
namespace Kokkos {
namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void spinwait( volatile int & flag , const int value );
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
#else
KOKKOS_INLINE_FUNCTION
void spinwait( volatile int & , const int ) {}
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
KOKKOS_INLINE_FUNCTION
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
#endif
} /* namespace Impl */
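
The renamed API reads as it is written: spinwait_while_equal returns once the flag stops comparing equal to the given value, and spinwait_until_equal returns once it does. Below is a self-contained analogue using std::atomic and std::thread, with std::this_thread::yield standing in for the nop/pause backoff; it illustrates the semantics only, not the Kokkos code path.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>

// Analogue of spinwait_until_equal: return once 'flag' equals 'value'.
void spinwait_until_equal( const std::atomic< int32_t > & flag , const int32_t value )
{
  while ( flag.load( std::memory_order_acquire ) != value ) {
    std::this_thread::yield();   // stand-in for the backoff above
  }
}

int main()
{
  std::atomic< int32_t > flag( 0 );
  std::thread worker( [ & flag ]() { flag.store( 1 , std::memory_order_release ); } );
  spinwait_until_equal( flag , 1 );   // returns once the worker has stored 1
  std::printf( "flag = %d\n" , static_cast< int >( flag.load() ) );
  worker.join();
  return 0 ;
}
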


@ -115,10 +115,31 @@ IF(Kokkos_ENABLE_OpenMP)
)
ENDIF()
IF(Kokkos_ENABLE_QTHREAD)
IF(Kokkos_ENABLE_Qthreads)
TRIBITS_ADD_EXECUTABLE_AND_TEST(
UnitTest_Qthread
SOURCES UnitTestMain.cpp TestQthread.cpp
UnitTest_Qthreads
SOURCES
UnitTestMain.cpp
qthreads/TestQthreads_Atomics.cpp
qthreads/TestQthreads_Other.cpp
qthreads/TestQthreads_Reductions.cpp
qthreads/TestQthreads_SubView_a.cpp
qthreads/TestQthreads_SubView_b.cpp
qthreads/TestQthreads_SubView_c01.cpp
qthreads/TestQthreads_SubView_c02.cpp
qthreads/TestQthreads_SubView_c03.cpp
qthreads/TestQthreads_SubView_c04.cpp
qthreads/TestQthreads_SubView_c05.cpp
qthreads/TestQthreads_SubView_c06.cpp
qthreads/TestQthreads_SubView_c07.cpp
qthreads/TestQthreads_SubView_c08.cpp
qthreads/TestQthreads_SubView_c09.cpp
qthreads/TestQthreads_SubView_c10.cpp
qthreads/TestQthreads_SubView_c11.cpp
qthreads/TestQthreads_SubView_c12.cpp
qthreads/TestQthreads_Team.cpp
qthreads/TestQthreads_ViewAPI_a.cpp
qthreads/TestQthreads_ViewAPI_b.cpp
COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
@ -194,4 +215,3 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest
)
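
The renamed suite targets the Kokkos::Qthreads execution space (formerly Kokkos::Qthread). As a rough illustration of what such a test exercises, and not one of the listed fixtures, here is a minimal reduction program; it assumes a build configured with Kokkos_ENABLE_Qthreads and that the execution space is spelled Kokkos::Qthreads as in this release.

#include <Kokkos_Core.hpp>
#include <cstdio>

int main( int argc , char * argv[] )
{
  Kokkos::initialize( argc , argv );
  {
    long sum = 0 ;
    // Hypothetical minimal check in the spirit of the Qthreads unit tests.
    Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthreads >( 0 , 100 )
                           , KOKKOS_LAMBDA( const int i , long & partial ) { partial += i ; }
                           , sum );
    std::printf( "sum = %ld (expect 4950)\n" , sum );
  }
  Kokkos::finalize();
  return 0 ;
}
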


@ -6,6 +6,7 @@ vpath %.cpp ${KOKKOS_PATH}/core/unit_test
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/serial
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/threads
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmp
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/qthreads
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/cuda
TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
@ -78,6 +79,22 @@ endif
TEST_TARGETS += test-openmp
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
OBJ_QTHREADS = TestQthreads_Other.o TestQthreads_Reductions.o TestQthreads_Atomics.o TestQthreads_Team.o
OBJ_QTHREADS += TestQthreads_SubView_a.o TestQthreads_SubView_b.o
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
OBJ_QTHREADS += TestQthreads_SubView_c_all.o
else
OBJ_QTHREADS += TestQthreads_SubView_c01.o TestQthreads_SubView_c02.o TestQthreads_SubView_c03.o
OBJ_QTHREADS += TestQthreads_SubView_c04.o TestQthreads_SubView_c05.o TestQthreads_SubView_c06.o
OBJ_QTHREADS += TestQthreads_SubView_c07.o TestQthreads_SubView_c08.o TestQthreads_SubView_c09.o
OBJ_QTHREADS += TestQthreads_SubView_c10.o TestQthreads_SubView_c11.o TestQthreads_SubView_c12.o
endif
OBJ_QTHREADS += TestQthreads_ViewAPI_a.o TestQthreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_Qthreads
TEST_TARGETS += test-qthreads
endif
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
OBJ_SERIAL = TestSerial_Other.o TestSerial_Reductions.o TestSerial_Atomics.o TestSerial_Team.o
OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o
@ -94,12 +111,6 @@ endif
TEST_TARGETS += test-serial
endif
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_Qthread
TEST_TARGETS += test-qthread
endif
OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_HWLOC
TEST_TARGETS += test-hwloc
@ -115,10 +126,6 @@ TARGETS += ${INITTESTS_TARGETS}
INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o
TARGETS += KokkosCore_UnitTest_Synchronic
TEST_TARGETS += test-synchronic
KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Cuda
@ -131,8 +138,8 @@ KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Serial
KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthread
KokkosCore_UnitTest_Qthreads: $(OBJ_QTHREADS) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthreads
KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_HWLOC
@ -146,9 +153,6 @@ KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Synchronic
test-cuda: KokkosCore_UnitTest_Cuda
./KokkosCore_UnitTest_Cuda
@ -161,8 +165,8 @@ test-openmp: KokkosCore_UnitTest_OpenMP
test-serial: KokkosCore_UnitTest_Serial
./KokkosCore_UnitTest_Serial
test-qthread: KokkosCore_UnitTest_Qthread
./KokkosCore_UnitTest_Qthread
test-qthreads: KokkosCore_UnitTest_Qthreads
./KokkosCore_UnitTest_Qthreads
test-hwloc: KokkosCore_UnitTest_HWLOC
./KokkosCore_UnitTest_HWLOC
@ -176,9 +180,6 @@ test-default: KokkosCore_UnitTest_Default
${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
./KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
test-synchronic: KokkosCore_UnitTest_Synchronic
./KokkosCore_UnitTest_Synchronic
build_all: $(TARGETS)
test: $(TEST_TARGETS)
@ -193,4 +194,3 @@ clean: kokkos-clean
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
